Thursday, July 13, 2017

Digit Recognition Basic Models Intro


This article does some trial and error with a few basic models: Logistic Regression, LDA, Ridge and Lasso, each checked with the one-vs-rest (OvR) and one-vs-one (OvO) strategies. No optimisation is applied; however, I will use a basic 80%/20% train/test split and a 10-fold cross-validation to try to improve the accuracy a bit and to compare the different results.

Data set from Kaggle: https://www.kaggle.com/c/digit-recognizer


1. read kaggle data


%%time

import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn import metrics
# sklearn.cross_validation has been removed from scikit-learn;
# train_test_split lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

# Load the Kaggle CSV files from the working directory.
dataset = pd.read_csv("train.csv")
# The first column is used as the label. Select it by position with .iloc:
# dataset[[0]] looks up a column *named* 0 and fails when the header row
# holds string names.
target = dataset.iloc[:, 0].values
# Every remaining column is used as an input feature.
train = dataset.iloc[:, 1:].values
test = pd.read_csv("test.csv").values

# Hold-out split: 80% of the labelled rows for training, 20% for scoring.
x_train, x_test, y_train, y_test = train_test_split(
    train, target, train_size=0.8, test_size=0.2)

# x = input, y = output; x_finaltest is the final test set X.
x_finaltest = test

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

print(x_finaltest.shape)

2. Logistic OvO


%%time

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsOneClassifier

# Logistic regression wrapped in a one-vs-one multiclass strategy,
# trained on x_train / y_train from the split cell.
model1 = OneVsOneClassifier(LogisticRegression()).fit(x_train, y_train)

# Accuracy on the x_test / y_test part of the split.
y_predict1 = model1.predict(x_test)
print(metrics.accuracy_score(y_predict1, y_test))

# Predict the final test set and write it as "ImageId,Label" rows;
# ImageId counts from 1.
y_pred1 = model1.predict(x_finaltest)
np.savetxt(
    'logistic_ovo_2.csv',
    np.c_[range(1, len(x_finaltest) + 1), y_pred1],
    delimiter=',',
    header='ImageId,Label',
    comments='',
    fmt='%d',
)

3. Logistic OvR


%%time
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Logistic regression wrapped in a one-vs-rest multiclass strategy,
# trained on x_train / y_train from the split cell.
model4 = OneVsRestClassifier(LogisticRegression()).fit(x_train, y_train)

# Accuracy on the x_test / y_test part of the split.
y_predict4 = model4.predict(x_test)
print(metrics.accuracy_score(y_predict4, y_test))

# Predict the final test set and write it as "ImageId,Label" rows;
# ImageId counts from 1.
y_pred4 = model4.predict(x_finaltest)
np.savetxt(
    'logistic_ovr_2.csv',
    np.c_[range(1, len(x_finaltest) + 1), y_pred4],
    delimiter=',',
    header='ImageId,Label',
    comments='',
    fmt='%d',
)

4. LDA OvR


%%time
# sklearn.lda has been removed from scikit-learn; the estimator now lives in
# sklearn.discriminant_analysis. The LDA alias keeps the rest of the cell as-is.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Linear discriminant analysis wrapped in a one-vs-rest multiclass strategy.
# OneVsRestClassifier is imported by the Logistic OvR cell.
model2 = OneVsRestClassifier(LDA())
model2.fit(x_train, y_train)

# Accuracy on the x_test / y_test part of the split.
y_predict2 = model2.predict(x_test)
print(metrics.accuracy_score(y_test, y_predict2))

# Predict the final test set and write it as "ImageId,Label" rows;
# ImageId counts from 1.
y_pred2 = model2.predict(x_finaltest)

np.savetxt('lda_ovr_2.csv', np.c_[range(1, len(x_finaltest) + 1), y_pred2],
           delimiter=',', header='ImageId,Label', comments='', fmt='%d')

5. LDA OvO


%%time
# sklearn.lda has been removed from scikit-learn; the estimator now lives in
# sklearn.discriminant_analysis. The LDA alias keeps the rest of the cell as-is.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Linear discriminant analysis wrapped in a one-vs-one multiclass strategy.
# OneVsOneClassifier is imported by the Logistic OvO cell.
model3 = OneVsOneClassifier(LDA())
model3.fit(x_train, y_train)

# Accuracy on the x_test / y_test part of the split.
y_predict3 = model3.predict(x_test)
print(metrics.accuracy_score(y_test, y_predict3))

# Predict the final test set and write it as "ImageId,Label" rows;
# ImageId counts from 1.
y_pred3 = model3.predict(x_finaltest)

np.savetxt('lda_ovo_2.csv', np.c_[range(1, len(x_finaltest) + 1), y_pred3],
           delimiter=',', header='ImageId,Label', comments='', fmt='%d')

6. Ridge OvO


from sklearn.linear_model import Ridge

# Ridge (alpha=1.0) wrapped in a one-vs-one multiclass strategy.
model5 = OneVsOneClassifier(Ridge(alpha=1.0))  # 0. 0.1, 2
model5.fit(x_train, y_train)

# Accuracy on the x_test / y_test part of the split; used to compare
# alpha values.
y_predict5 = model5.predict(x_test)
print(metrics.accuracy_score(y_predict5, y_test))

# Final prediction, written as "ImageId,Label" rows; ImageId counts from 1.
y_pred5 = model5.predict(x_finaltest)
np.savetxt(
    'ridge_ovo_2.csv',
    np.c_[range(1, len(x_finaltest) + 1), y_pred5],
    delimiter=',',
    header='ImageId,Label',
    comments='',
    fmt='%d',
)

7. Ridge OvR


from sklearn.linear_model import Ridge

# Ridge (alpha=1.0) wrapped in a one-vs-rest multiclass strategy.
model6 = OneVsRestClassifier(Ridge(alpha=1.0))  # 0. 0.1, 2
model6.fit(x_train, y_train)

# Accuracy on the x_test / y_test part of the split; used to compare
# alpha values.
y_predict6 = model6.predict(x_test)
print(metrics.accuracy_score(y_predict6, y_test))

# Final prediction, written as "ImageId,Label" rows; ImageId counts from 1.
y_pred6 = model6.predict(x_finaltest)
np.savetxt(
    'ridge_ovr_2.csv',
    np.c_[range(1, len(x_finaltest) + 1), y_pred6],
    delimiter=',',
    header='ImageId,Label',
    comments='',
    fmt='%d',
)

8. Lasso OvO



from sklearn.linear_model import Lasso

# Lasso (alpha=0.1) wrapped in a one-vs-one multiclass strategy.
model7 = OneVsOneClassifier(Lasso(alpha=0.1))  # 0. 0.1, 2
model7.fit(x_train, y_train)

# Accuracy on the x_test / y_test part of the split; used to compare
# alpha values.
y_predict7 = model7.predict(x_test)
print(metrics.accuracy_score(y_predict7, y_test))

# Final prediction, written as "ImageId,Label" rows; ImageId counts from 1.
y_pred7 = model7.predict(x_finaltest)
np.savetxt(
    'Lasso_ovo_2.csv',
    np.c_[range(1, len(x_finaltest) + 1), y_pred7],
    delimiter=',',
    header='ImageId,Label',
    comments='',
    fmt='%d',
)

9. Lasso OvR


from sklearn.linear_model import Lasso

# Lasso (alpha=0.1) wrapped in a one-vs-rest multiclass strategy.
model8 = OneVsRestClassifier(Lasso(alpha=0.1))  # 0. 0.1, 2
model8.fit(x_train, y_train)

# Accuracy on the x_test / y_test part of the split; used to compare
# alpha values.
y_predict8 = model8.predict(x_test)
print(metrics.accuracy_score(y_predict8, y_test))

# Final prediction, written as "ImageId,Label" rows; ImageId counts from 1.
y_pred8 = model8.predict(x_finaltest)
np.savetxt(
    'Lasso_ovr_2.csv',
    np.c_[range(1, len(x_finaltest) + 1), y_pred8],
    delimiter=',',
    header='ImageId,Label',
    comments='',
    fmt='%d',
)

10 Folds 

1. read kaggle data


%%time

import numpy as np
import pandas as pd
# sklearn.lda has been removed from scikit-learn; the estimator now lives in
# sklearn.discriminant_analysis. The LDA alias keeps the old name available.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn import datasets
from sklearn import metrics
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.linear_model import Lasso
from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

# Load the Kaggle CSV files from the working directory.
dataset = pd.read_csv("train.csv")
# The first column is used as the label. Select it by position with .iloc:
# dataset[[0]] looks up a column *named* 0 and fails when the header row
# holds string names.
target = dataset.iloc[:, 0].values
# Every remaining column is used as an input feature.
train = dataset.iloc[:, 1:].values
test = pd.read_csv("test.csv").values

# Final test set X.
x_finaltest = test

# 10-fold splitter shared by the model cells that follow.
kf = KFold(n_splits=10)

2. Logistic OvO



%%time

# 10-fold cross-validation of one-vs-one logistic regression.
total_score1 = []
model1 = OneVsOneClassifier(LogisticRegression())
for train_index, test_index in kf.split(train):
    x_train = train[train_index]
    y_train = target[train_index]
    x_test = train[test_index]
    y_test = target[test_index]

    # Refit the same estimator object on this fold's training rows.
    model1.fit(x_train, y_train)

    # Score on the rows held out for this fold.
    y_predict1 = model1.predict(x_test)
    total_score1.append(metrics.accuracy_score(y_test, y_predict1))

# Average accuracy over the folds.
print(np.mean(total_score1))

# After the loop model1 only holds the fit from the last fold, which never
# saw that fold's held-out rows. Refit on every labelled row so the
# submission uses all available training data.
model1.fit(train, target)
y_pred1 = model1.predict(x_finaltest)

# Write "ImageId,Label" rows; ImageId counts from 1.
np.savetxt('logistic_ovo_3.csv', np.c_[range(1, len(x_finaltest) + 1), y_pred1],
           delimiter=',', header='ImageId,Label', comments='', fmt='%d')

3. Logistic OvR


%%time

# 10-fold cross-validation of one-vs-rest logistic regression.
total_score2 = []
model2 = OneVsRestClassifier(LogisticRegression())
for train_index, test_index in kf.split(train):
    x_train = train[train_index]
    y_train = target[train_index]
    x_test = train[test_index]
    y_test = target[test_index]

    # Refit the same estimator object on this fold's training rows.
    model2.fit(x_train, y_train)

    # Score on the rows held out for this fold.
    y_predict2 = model2.predict(x_test)
    total_score2.append(metrics.accuracy_score(y_test, y_predict2))

# Average accuracy over the folds.
print(np.mean(total_score2))

# After the loop model2 only holds the fit from the last fold, which never
# saw that fold's held-out rows. Refit on every labelled row so the
# submission uses all available training data.
model2.fit(train, target)
y_pred2 = model2.predict(x_finaltest)

# Write "ImageId,Label" rows; ImageId counts from 1.
np.savetxt('logistic_ovr_3.csv', np.c_[range(1, len(x_finaltest) + 1), y_pred2],
           delimiter=',', header='ImageId,Label', comments='', fmt='%d')
The other models are omitted here. Please leave a message if you find this useful.

The most accurate model is known to be the CNN; I will demonstrate it in another article in the future.

No comments:

Post a Comment

Add Loading Spinner for web request.

when web page is busily loading. normally we need to add a spinner for the user to kill their waiting impatience. Here, 2 steps we need to d...