This article uses trial and error to compare several models: Logistic Regression, LDA, Ridge, and Lasso, each checked with both one-vs-rest (OvR) and one-vs-one (OvO). No optimisation will be applied; however, I will run a simple 80%/20% train/test split and a basic 10-fold cross-validation to try to improve the accuracy a little and to compare the different results.
Data set from kaggle : https://www.kaggle.com/c/digit-recognizer
1. read kaggle data
%%time import numpy as np import pandas as pd from sklearn import datasets from sklearn import metrics from sklearn.cross_validation import train_test_split dataset = pd.read_csv("train.csv") target = dataset[[0]].values.ravel() train = dataset.iloc[:,1:].values test = pd.read_csv("test.csv").values # cross validation : training set 80% ,test set:20% x_train,x_test, y_train, y_test = train_test_split(train,target,train_size=0.8,test_size=0.2) x_finaltest = test # x = input , y = output final test set X print(x_train.shape) print(y_train.shape) print (x_test.shape) print (y_test.shape) print(x_finaltest.shape)
2. Logistic OvO
%%time from sklearn.multiclass import OneVsOneClassifier from sklearn.linear_model import LogisticRegression model1 = OneVsOneClassifier(LogisticRegression()) model1.fit(x_train, y_train) # print accurancy y_predict1 = model1.predict(x_test) print (metrics.accuracy_score(y_predict1,y_test)) y_pred1 = model1.predict(x_finaltest) np.savetxt('logistic_ovo_2.csv', np.c_[range(1,len(x_finaltest)+1),y_pred1], delimiter=',', header = 'ImageId,Label', comments = '', fmt='%d')
3. Logistic OvR
%%time from sklearn.multiclass import OneVsRestClassifier from sklearn.linear_model import LogisticRegression model4 = OneVsRestClassifier(LogisticRegression()) model4.fit(x_train, y_train) # print accurancy y_predict4 = model4.predict(x_test) print (metrics.accuracy_score(y_predict4,y_test)) y_pred4 = model4.predict(x_finaltest) np.savetxt('logistic_ovr_2.csv', np.c_[range(1,len(x_finaltest)+1),y_pred4], delimiter=',', header = 'ImageId,Label', comments = '', fmt='%d')
4. LDA OvR
%%time from sklearn.lda import LDA model2 = OneVsRestClassifier(LDA()) model2.fit(x_train,y_train) # print accurancy y_predict2 = model2.predict(x_test) print (metrics.accuracy_score(y_predict2,y_test)) y_pred2 = model2.predict(x_finaltest) np.savetxt('lda_ovr_2.csv', np.c_[range(1,len(x_finaltest)+1),y_pred2], delimiter=',', header = 'ImageId,Label', comments = '', fmt='%d')
5. LDA OvO
%%time from sklearn.lda import LDA model3 = OneVsOneClassifier(LDA()) model3.fit(x_train,y_train) # print accurancy y_predict3 = model3.predict(x_test) print (metrics.accuracy_score(y_predict3,y_test)) y_pred3 = model3.predict(x_finaltest) np.savetxt('lda_ovo_2.csv', np.c_[range(1,len(x_finaltest)+1),y_pred3], delimiter=',', header = 'ImageId,Label', comments = '', fmt='%d')
6. Ridge OvO
from sklearn.linear_model import Ridge

# Ridge regression wrapped in a one-vs-one multiclass scheme.
# Other alpha candidates noted by the author: 0, 0.1, 2.
model5 = OneVsOneClassifier(Ridge(alpha=1.0))
model5.fit(x_train, y_train)

# Held-out accuracy; used to compare alpha values.
y_predict5 = model5.predict(x_test)
print(metrics.accuracy_score(y_test, y_predict5))

# Final prediction: write "ImageId,Label" rows (ids start at 1).
y_pred5 = model5.predict(x_finaltest)
np.savetxt(
    'ridge_ovo_2.csv',
    np.column_stack((np.arange(1, len(x_finaltest) + 1), y_pred5)),
    delimiter=',',
    header='ImageId,Label',
    comments='',
    fmt='%d',
)
7. Ridge OvR
from sklearn.linear_model import Ridge

# Ridge regression wrapped in a one-vs-rest multiclass scheme.
# Other alpha candidates noted by the author: 0, 0.1, 2.
model6 = OneVsRestClassifier(Ridge(alpha=1.0))
model6.fit(x_train, y_train)

# Held-out accuracy; used to compare alpha values.
y_predict6 = model6.predict(x_test)
print(metrics.accuracy_score(y_test, y_predict6))

# Final prediction: write "ImageId,Label" rows (ids start at 1).
y_pred6 = model6.predict(x_finaltest)
np.savetxt(
    'ridge_ovr_2.csv',
    np.column_stack((np.arange(1, len(x_finaltest) + 1), y_pred6)),
    delimiter=',',
    header='ImageId,Label',
    comments='',
    fmt='%d',
)
8. Lasso OvO
from sklearn.linear_model import Lasso

# Lasso regression wrapped in a one-vs-one multiclass scheme.
# Other alpha candidates noted by the author: 0, 0.1, 2.
model7 = OneVsOneClassifier(Lasso(alpha=0.1))
model7.fit(x_train, y_train)

# Held-out accuracy; used to compare alpha values.
y_predict7 = model7.predict(x_test)
print(metrics.accuracy_score(y_test, y_predict7))

# Final prediction: write "ImageId,Label" rows (ids start at 1).
y_pred7 = model7.predict(x_finaltest)
np.savetxt(
    'Lasso_ovo_2.csv',
    np.column_stack((np.arange(1, len(x_finaltest) + 1), y_pred7)),
    delimiter=',',
    header='ImageId,Label',
    comments='',
    fmt='%d',
)
9. Lasso OvR
from sklearn.linear_model import Lasso

# Lasso regression wrapped in a one-vs-rest multiclass scheme.
# Other alpha candidates noted by the author: 0, 0.1, 2.
model8 = OneVsRestClassifier(Lasso(alpha=0.1))
model8.fit(x_train, y_train)

# Held-out accuracy; used to compare alpha values.
y_predict8 = model8.predict(x_test)
print(metrics.accuracy_score(y_test, y_predict8))

# Final prediction: write "ImageId,Label" rows (ids start at 1).
y_pred8 = model8.predict(x_finaltest)
np.savetxt(
    'Lasso_ovr_2.csv',
    np.column_stack((np.arange(1, len(x_finaltest) + 1), y_pred8)),
    delimiter=',',
    header='ImageId,Label',
    comments='',
    fmt='%d',
)
10 Folds
1. read kaggle data
%%time import numpy as np import pandas as pd from sklearn.lda import LDA from sklearn import datasets from sklearn import metrics from sklearn.linear_model import Ridge from sklearn.model_selection import KFold from sklearn.linear_model import Lasso from sklearn.multiclass import OneVsOneClassifier from sklearn.multiclass import OneVsRestClassifier from sklearn.linear_model import LogisticRegression dataset = pd.read_csv("train.csv") target = dataset[[0]].values.ravel() train = dataset.iloc[:,1:].values test = pd.read_csv("test.csv").values # test set x_finaltest = test kf = KFold(n_splits = 10)
2. Logistic OvO
%%time total_score1 = [] model1 = OneVsOneClassifier(LogisticRegression()) for train_index ,test_index in kf.split(train): x_train = train[train_index] y_train = target[train_index] x_test = train[test_index] y_test = target[test_index] # print ("train: {0} test:{1}".format(train_index,test_index)) #train the model model1.fit(x_train, y_train) # test model y_predict1 = model1.predict(x_test) # print accurancy #print (metrics.accuracy_score(y_predict1,y_test)) total_score1.append(metrics.accuracy_score(y_predict1,y_test)) #平均分 print(np.mean(total_score1)) y_pred1 = model1.predict(x_finaltest) np.savetxt('logistic_ovo_3.csv', np.c_[range(1,len(x_finaltest)+1),y_pred1], delimiter=',', header = 'ImageId,Label', comments = '', fmt='%d')
3. Logistic OvR
%%time total_score2 = [] model2 = OneVsRestClassifier(LogisticRegression()) for train_index ,test_index in kf.split(train): x_train = train[train_index] y_train = target[train_index] x_test = train[test_index] y_test = target[test_index] #print ("train: {0} test:{1}".format(train_index,test_index)) #train the model model2.fit(x_train, y_train) # test model y_predict2 = model2.predict(x_test) # print accurancy #print (metrics.accuracy_score(y_predict2,y_test)) total_score2.append(metrics.accuracy_score(y_predict2,y_test)) print(np.mean(total_score2)) y_pred2 = model2.predict(x_finaltest) np.savetxt('logistic_ovr_3.csv', np.c_[range(1,len(x_finaltest)+1),y_pred2], delimiter=',', header = 'ImageId,Label', comments = '', fmt='%d')
The most accurate model for this task is known to be a CNN (convolutional neural network); I will demonstrate it in a future article.
No comments:
Post a Comment