SI 670 Kaggle Competition, Team 18

Data preparation

In [ ]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest 
from sklearn.feature_selection import chi2
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import RidgeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from scipy.sparse import hstack
In [ ]:
# Read the datasets and preprocess
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Flag missing references (x != x is True only for NaN), then replace them with empty strings
train['ind'] = train['references'].map(lambda x: x != x) * 1
train.loc[train['ind'] == 1, 'references'] = ''
test['ind'] = test['references'].map(lambda x: x != x) * 1
test.loc[test['ind'] == 1, 'references'] = ''

# Word-count features for title, abstract and references
train['tit_len'] = train['title'].map(lambda x: len(x.split()))
train['abs_len'] = train['abstract'].map(lambda x: len(x.split()))
train['ref_len'] = train['references'].map(lambda x: len(x.split()))
test['tit_len'] = test['title'].map(lambda x: len(x.split()))
test['abs_len'] = test['abstract'].map(lambda x: len(x.split()))
test['ref_len'] = test['references'].map(lambda x: len(x.split()))
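
A quick optional sanity check on the engineered features (a minimal sketch, assuming the cell above has been run): confirm that no missing references remain, inspect the length features, and check the class balance of the label.

In [ ]:
# Optional sanity check on the engineered features
print(train['references'].isna().sum(), test['references'].isna().sum())  # both should be 0 after the fill
print(train[['ind','tit_len','abs_len','ref_len']].describe())
print(train['label'].value_counts())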

Method 1

Preprocessing

In [ ]:
## Train test split
X_train, X_test, y_train, y_test = train_test_split(train.loc[:,train.columns!='label'], train['label'], test_size=0.2, random_state=10)
In [ ]:
## TF-IDF encode title, abstract and references separately (word tokens, sublinear tf, min_df=5)
vectorizer = TfidfVectorizer(token_pattern=r'\b\w+\b',sublinear_tf=True,min_df=5)
tit_train_train=vectorizer.fit_transform(X_train["title"])
tit_train_test=vectorizer.transform(X_test["title"])

vectorizer = TfidfVectorizer(token_pattern=r'\b\w+\b',sublinear_tf=True,min_df=5)
abs_train_train=vectorizer.fit_transform(X_train["abstract"])
abs_train_test=vectorizer.transform(X_test["abstract"])

vectorizer = TfidfVectorizer(token_pattern=r'\b\w+\b',sublinear_tf=True,min_df=5)
ref_train_train=vectorizer.fit_transform(X_train["references"])
ref_train_test=vectorizer.transform(X_test["references"])
In [ ]:
## Stack the TF-IDF blocks from title, abstract and references, then append the numeric features
tit_abs_ref_train=hstack((tit_train_train,abs_train_train,ref_train_train))
tit_abs_ref_test=hstack((tit_train_test,abs_train_test,ref_train_test))
X_train = hstack((tit_abs_ref_train,X_train[['year','n_citation','ind','tit_len','abs_len','ref_len']].reset_index(drop=True)))
X_test = hstack((tit_abs_ref_test,X_test[['year','n_citation','ind','tit_len','abs_len','ref_len']].reset_index(drop=True)))
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)
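
A brief optional check that the stacked design matrices are consistent: the TF-IDF blocks plus the six numeric columns should have the same number of rows as the corresponding label vectors.

In [ ]:
# Optional: row counts of the sparse design matrices should match the labels
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)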

Model and parameter selection

In [ ]:
## Use cross-validation to choose alpha and tol
## (note: the normalize= argument of RidgeClassifier assumes scikit-learn < 1.2, where it was still available)
for alpha in [4,5,6,7]:
    for tol in [0.0001,0.001,0.01,0.1,1]:
        clf = RidgeClassifier(alpha=alpha,normalize=True,solver="sparse_cg",tol=tol)
        cvs = cross_val_score(clf, X_train, y_train, cv=5)
        print([alpha, tol, cvs, np.mean(cvs)])
In [ ]:
clf = RidgeClassifier(alpha=5,normalize=True,solver="sparse_cg",tol=0.1).fit(X_train,y_train)
print(clf.score(X_train,y_train))
print(clf.score(X_test, y_test))
In [ ]:
## Bagging: bootstrap rows, fit a ridge classifier on each resample, and majority-vote the predictions
ridge_lst = []
for i in range(10):
    tryind = np.random.choice(X_train.shape[0],X_train.shape[0])
    clf = RidgeClassifier(alpha=5,normalize=True,solver="sparse_cg",tol=0.1).fit(X_train.tocsr()[tryind,:],y_train[tryind])
    ridge_lst.append(clf.predict(X_test))
## Majority vote across the bagged models
np.mean(pd.DataFrame(ridge_lst).T.mode(axis=1)[0]==y_test)
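
The last line above takes a row-wise majority vote over the ten bagged predictions; the same expression is reused in the next cell. A small helper such as the hypothetical majority_vote below could factor it out (a sketch, not part of the original pipeline).

In [ ]:
# Hypothetical helper: row-wise majority vote over a list of prediction arrays
def majority_vote(pred_list):
    # mode(axis=1) gives the most frequent label per row; [0] breaks ties by taking the first mode
    return pd.DataFrame(pred_list).T.mode(axis=1)[0]

# Equivalent to the accuracy computed above
np.mean(majority_vote(ridge_lst) == y_test)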
In [ ]:
## Random subspace: bootstrap rows and sample 20000 feature columns (with replacement) per model
ridge_lst = []
for i in range(10):
    trycol = np.random.choice(X_train.shape[1],20000)
    tryrow = np.random.choice(X_train.shape[0],X_train.shape[0])
    clf = RidgeClassifier(alpha=5,normalize=True,solver="sparse_cg",tol=0.1).fit(X_train.tocsr()[:,trycol][tryrow,:],y_train[tryrow])
    ridge_lst.append(clf.predict(X_test.tocsr()[:,trycol]))
## Majority vote across the models
np.mean(pd.DataFrame(ridge_lst).T.mode(axis=1)[0]==y_test)

Prediction

In [ ]:
## Refit the TF-IDF vectorizers on the full training set for the final model
vectorizer = TfidfVectorizer(token_pattern=r'\b\w+\b',sublinear_tf=True,min_df=5)
tit_train=vectorizer.fit_transform(train["title"])
tit_test=vectorizer.transform(test["title"])

vectorizer = TfidfVectorizer(token_pattern=r'\b\w+\b',sublinear_tf=True,min_df=5)
abs_train=vectorizer.fit_transform(train["abstract"])
abs_test=vectorizer.transform(test["abstract"])

vectorizer = TfidfVectorizer(token_pattern=r'\b\w+\b',sublinear_tf=True,min_df=5)
ref_train=vectorizer.fit_transform(train["references"])
ref_test=vectorizer.transform(test["references"])
In [ ]:
tit_abs_ref_train0=hstack((tit_train,abs_train,ref_train))
tit_abs_ref_test0=hstack((tit_test,abs_test,ref_test))
In [ ]:
## Generate the final datasets
train_x_new = hstack((train[['year','n_citation','ind','tit_len','abs_len','ref_len']],tit_abs_ref_train0))
train_y = train['label']
test_x_new = hstack((test[['year','n_citation','ind','tit_len','abs_len','ref_len']],tit_abs_ref_test0))
In [ ]:
clf = RidgeClassifier(alpha=5,normalize=True,solver="sparse_cg",tol=0.1).fit(train_x_new,train_y)
print(clf.score(train_x_new,train_y))
In [ ]:
## Predict and write the submission file (29809 = number of test rows)
pd.DataFrame({'Id': range(29809),'Category': clf.predict(test_x_new)}).to_csv("output11.csv", index=False)
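
An optional read-back of the submission file to confirm it has the expected Id and Category columns and one row per test example before uploading.

In [ ]:
# Optional: verify the shape and columns of the submission file
sub = pd.read_csv("output11.csv")
print(sub.shape)   # expected (29809, 2)
print(sub.head())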

Method 2

In [ ]:
## Train test split
X_train, X_test, y_train, y_test = train_test_split(train.loc[:,train.columns!='label'], train['label'], test_size=0.2, random_state=10)

Preprocessing

In [ ]:
## Tf-idf encoding
vectorizer = TfidfVectorizer(token_pattern=r'\b\w+\b',max_features=4645,sublinear_tf=True,min_df=5) #ngram_range=(1,3)
tit_train_train=pd.DataFrame(vectorizer.fit_transform(X_train["title"]).toarray())
tit_train_train.columns=[x+'_tit' for x in vectorizer.get_feature_names()]
tit_train_test=pd.DataFrame(vectorizer.transform(X_test['title']).toarray())
tit_train_test.columns=tit_train_train.columns

vectorizer = TfidfVectorizer(token_pattern=r'\b\w+\b',max_features=15000,sublinear_tf=True,min_df=5)
abs_train_train=pd.DataFrame(vectorizer.fit_transform(X_train["abstract"]).toarray())
abs_train_train.columns=[x+'_abs' for x in vectorizer.get_feature_names()]
abs_train_test=pd.DataFrame(vectorizer.transform(X_test['abstract']).toarray())
abs_train_test.columns=abs_train_train.columns

vectorizer = TfidfVectorizer(token_pattern=r'\b\w+\b',max_features=15000,sublinear_tf=True,min_df=5)
ref_train_train=pd.DataFrame(vectorizer.fit_transform(X_train["references"]).toarray())
ref_train_train.columns=[x+'_ref' for x in vectorizer.get_feature_names()]
ref_train_test=pd.DataFrame(vectorizer.transform(X_test['references']).toarray())
ref_train_test.columns=ref_train_train.columns
In [ ]:
## Concatenate the TF-IDF features from title, abstract and references
tit_abs_ref_train=pd.concat([tit_train_train,abs_train_train,ref_train_train],axis=1)
tit_abs_ref_test=pd.concat([tit_train_test,abs_train_test,ref_train_test],axis=1)
## Chi2 scoring (only scores_ is used below, so the value of k here does not matter)
chi2_f = SelectKBest(score_func=chi2, k=300)
X_kbest = chi2_f.fit(tit_abs_ref_train,y_train)
## Positions of the 30000 features with the highest chi2 scores
ind_tit_abs_ref = np.argsort(X_kbest.scores_)[::-1][:30000].tolist()
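
Since only the chi2 scores are used, an equivalent and slightly simpler route is to let SelectKBest keep the top features directly; a minimal sketch (not the pipeline used below), assuming at least 30000 TF-IDF columns survive the min_df filter.

In [ ]:
# Alternative sketch: let SelectKBest pick the 30000 highest-scoring features directly
selector = SelectKBest(score_func=chi2, k=30000)
tit_abs_ref_train_sel = selector.fit_transform(tit_abs_ref_train, y_train)
tit_abs_ref_test_sel = selector.transform(tit_abs_ref_test)
print(tit_abs_ref_train_sel.shape, tit_abs_ref_test_sel.shape)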
In [ ]:
## Option A: keep the 6 numeric features plus the 30000 TF-IDF features with the highest chi2 scores (run this cell or the next one, not both)
X_train = pd.concat([X_train[['year','n_citation','ind','tit_len','abs_len','ref_len']].reset_index(drop=True),tit_abs_ref_train.iloc[:,ind_tit_abs_ref]],axis=1)
X_test = pd.concat([X_test[['year','n_citation','ind','tit_len','abs_len','ref_len']].reset_index(drop=True),tit_abs_ref_test.iloc[:,ind_tit_abs_ref]],axis=1)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)
In [ ]:
## Option B: keep the 6 numeric features plus all TF-IDF features (alternative to the previous cell)
X_train = pd.concat([X_train[['year','n_citation','ind','tit_len','abs_len','ref_len']].reset_index(drop=True),tit_abs_ref_train],axis=1)
X_test = pd.concat([X_test[['year','n_citation','ind','tit_len','abs_len','ref_len']].reset_index(drop=True),tit_abs_ref_test],axis=1)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)
In [ ]:
## Min-max scale the datasets for the methods below that need scaled or non-negative inputs
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
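
An optional check that the scaling behaves as expected: the scaled training features should lie in [0, 1], which also provides the non-negative inputs that MultinomialNB requires below (test features can fall slightly outside the range because the scaler is fitted on the training split only).

In [ ]:
# Optional: scaled training features should lie in [0, 1]
print(X_train_scaled.min(), X_train_scaled.max())
print(X_test_scaled.min(), X_test_scaled.max())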

Model & parameter selection

In [ ]:
## Ridge: compare cross-validation scores for datasets with small p
for i in [2,3,4,5]:
    ridge = RidgeClassifier(alpha=i,normalize=True)
    cvs = cross_val_score(ridge, X_train, y_train, cv=5)
    print([i, cvs, np.mean(cvs)])
In [ ]:
## Ridge: compare train/validation accuracy on a single split for datasets with large p
for i in [1.5,2,3,4,5,6,7]:
    clf = RidgeClassifier(alpha=i,normalize=True).fit(X_train,y_train)
    print(i, clf.score(X_train,y_train), clf.score(X_test, y_test))
## Records (train accuracy, validation accuracy) for different numbers of selected features:
##  8000: alpha=1.5: 0.9300121608588082, 0.8906407245890641 | alpha=2: 0.9258187612697614, 0.8911439114391144 | alpha=3: 0.9206608797752338, 0.8918148272391815 | alpha=4: 0.9173900280957773, 0.8916470982891647
##  9000: alpha=2: 0.9303056988300415, 0.8916470982891647 | alpha=3: 0.9246446093848283, 0.891479369339148
## 15000: alpha=1.5: 0.9552983603807607, 0.8898020798389802 | alpha=2: 0.9494276009560951, 0.8918148272391815 | alpha=3: 0.9403698578437539, 0.8934921167393493 | alpha=3.6: 0.9367635341971736, 0.8934921167393493 | alpha=4: 0.9346668344026502, 0.8934921167393493 | alpha=4.1: 0.9343313624355265, 0.8933243877893324 | alpha=4.5: 0.9329894745670315, 0.8926534719892654 | alpha=5: 0.9307250387889462, 0.8918148272391815
## 25000: alpha=4: 0.9544596804629513, 0.8956725930895673 | alpha=5: 0.9488405250136286, 0.8960080509896008 | alpha=6: 0.9447729274122532, 0.8950016772895002 | alpha=7: 0.941544009728687, 0.8950016772895002
## 30000: alpha=4: 0.9617142617520024, 0.8958403220395841 | alpha=5: 0.95613704029857, 0.8961757799396176 | alpha=6: 0.9526565186396612, 0.8960080509896008 | alpha=7: 0.9486727890300667, 0.8948339483394834
## 35000: alpha=5: 0.9623432716903594, 0.8960080509896008 | alpha=6: 0.9584434100725459, 0.8960080509896008 | alpha=7: 0.9546274164465132, 0.8948339483394834
In [ ]:
## Bagging: bootstrap rows, fit a ridge classifier on each resample, and majority-vote the predictions
ridge_lst = []
for i in range(3):
    tryind = np.random.choice(X_train.shape[0],X_train.shape[0])
    clf = RidgeClassifier(alpha=2,normalize=True).fit(X_train.iloc[tryind,:],y_train[tryind])
    print(clf.score(X_test, y_test))
    ridge_lst.append(clf.predict(X_test))
np.mean(pd.DataFrame(ridge_lst).T.mode(axis=1)[0]==y_test)
In [ ]:
## Random subspace: bootstrap rows and sample 5000 feature columns per model
ridge_lst = []
for i in range(50):
    trycol = np.random.choice(X_train.shape[1],5000)
    tryrow = np.random.choice(X_train.shape[0],X_train.shape[0])
    clf = RidgeClassifier(alpha=2,normalize=True).fit(X_train.iloc[tryrow,trycol],y_train[tryrow])
    #print(clf.score(X_test.iloc[:,trycol], y_test))
    ridge_lst.append(clf.predict(X_test.iloc[:,trycol]))
np.mean(pd.DataFrame(ridge_lst).T.mode(axis=1)[0]==y_test)
In [ ]:
## Other classifiers tried on the scaled data; none matched the ridge classifier
### Linear SVC
lclf = LinearSVC(C=0.01).fit(X_train_scaled, y_train)
print(lclf.score(X_train_scaled, y_train), lclf.score(X_test_scaled, y_test))
### SGD
sgd = SGDClassifier(alpha=0.002,penalty='l2',loss='log').fit(X_train_scaled, y_train)
print(sgd.score(X_train_scaled, y_train), sgd.score(X_test_scaled, y_test))
### Logistic regression
lr = LogisticRegression(C=1).fit(X_train_scaled, y_train)
print(lr.score(X_train_scaled, y_train), lr.score(X_test_scaled, y_test))
### Multinomial naive Bayes
mnb = MultinomialNB(alpha=0.5).fit(X_train_scaled, y_train)
print(mnb.score(X_train_scaled, y_train), mnb.score(X_test_scaled, y_test))

Prediction

In [ ]:
## TF-IDF encoding, refit on the full training set for the final model
vectorizer = TfidfVectorizer(token_pattern=r'\b\w+\b',max_features=5000,sublinear_tf=True,min_df=5)
tit_train=pd.DataFrame(vectorizer.fit_transform(train["title"]).toarray())
tit_train.columns=[x+'_tit' for x in vectorizer.get_feature_names()]
tit_test=pd.DataFrame(vectorizer.transform(test['title']).toarray())
tit_test.columns=tit_train.columns

vectorizer = TfidfVectorizer(token_pattern=r'\b\w+\b',max_features=15000,sublinear_tf=True,min_df=5)
abs_train=pd.DataFrame(vectorizer.fit_transform(train["abstract"]).toarray())
abs_train.columns=[x+'_abs' for x in vectorizer.get_feature_names()]
abs_test=pd.DataFrame(vectorizer.transform(test['abstract']).toarray())
abs_test.columns=abs_train.columns

vectorizer = TfidfVectorizer(token_pattern=r'\b\w+\b',max_features=15000,sublinear_tf=True,min_df=5)
ref_train=pd.DataFrame(vectorizer.fit_transform(train["references"]).toarray())
ref_train.columns=[x+'_ref' for x in vectorizer.get_feature_names()]
ref_test=pd.DataFrame(vectorizer.transform(test['references']).toarray())
ref_test.columns=ref_train.columns
In [ ]:
## Concatenate the TF-IDF features from title, abstract and references
tit_abs_ref_train0=pd.concat([tit_train,abs_train,ref_train],axis=1)
tit_abs_ref_test0=pd.concat([tit_test,abs_test,ref_test],axis=1)
## Chi2 scoring (only scores_ is used below, so the value of k here does not matter)
chi2_f = SelectKBest(score_func=chi2, k=300)
X_kbest = chi2_f.fit(tit_abs_ref_train0,train['label'])
## Positions of the 30000 features with the highest chi2 scores
ind_tit_abs_ref0 = np.argsort(X_kbest.scores_)[::-1][:30000].tolist()
In [ ]:
## Generate the final datasets
train_x_new = pd.concat([train[['year','n_citation','ind','tit_len','abs_len','ref_len']],tit_abs_ref_train0.iloc[:,ind_tit_abs_ref0]],axis=1)
train_y = train['label']
test_x_new = pd.concat([test[['year','n_citation','ind','tit_len','abs_len','ref_len']],tit_abs_ref_test0.iloc[:,ind_tit_abs_ref0]],axis=1)
In [ ]:
## Fit the final model with the alpha chosen above
clf = RidgeClassifier(alpha=5,normalize=True).fit(train_x_new,train_y)
print(clf.score(train_x_new,train_y))
In [ ]:
## Predict and write the submission file (29809 = number of test rows)
pd.DataFrame({'Id': range(29809),'Category': clf.predict(test_x_new)}).to_csv("output10.csv", index=False)