In [11]:
# Feature Extraction with Univariate Statistical Tests (Chi-squared for classification)
from pandas import read_csv
from numpy import argsort
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# load data (column names are supplied explicitly via `names`)
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]   # first 8 columns: input attributes
Y = array[:,8]     # last column: class label

# feature extraction: keep the num_best attributes with the highest chi-squared score
num_best = 4
test = SelectKBest(score_func=chi2, k=num_best)
fit = test.fit(X, Y)

# summarize scores (one score per input attribute, in column order)
set_printoptions(precision=3)
print(fit.scores_)
features = fit.transform(X)

# summarize selected features (first 5 rows of the reduced matrix)
print(features[0:5,:])

# Rank the attributes by descending score and report the top num_best.
# The indices are computed from fit.scores_ instead of being hand-copied from a
# previous run, so the listing stays correct if the data or num_best changes.
best_indices = argsort(fit.scores_)[::-1][:num_best]
print("\nBest %d attributes (in order):" % num_best)
for i in best_indices:
    print(names[i])

[ 111.52  1411.887   17.605   53.108 2175.565  127.669    5.393  181.304]
[[148.    0.   33.6  50. ]
 [ 85.    0.   26.6  31. ]
 [183.    0.   23.3  32. ]
 [ 89.   94.   28.1  21. ]
 [137.  168.   43.1  33. ]]

Best 4 attributes (in order):
test
plas
age
mass


In [13]:
# Feature Extraction with RFE (recursive feature elimination)
from pandas import read_csv
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# load data (column names are supplied explicitly via `names`)
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]   # first 8 columns: input attributes
Y = array[:,8]     # last column: class label

# feature extraction
# The solver is pinned so the result does not depend on the library's default,
# which changed between scikit-learn releases.
model = LogisticRegression(solver='liblinear')
# n_features_to_select must be passed by keyword: newer scikit-learn releases
# no longer accept it positionally.
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(X, Y)
print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)
print("Feature Ranking: %s" % fit.ranking_)

# Report the selected attribute names straight from the support mask rather
# than from hard-coded indices, so the listing always matches the fit above.
# zip() stops at the shorter sequence, so the trailing 'class' name is skipped.
print("\nTop %d features:" % fit.n_features_)
for name, selected in zip(names, fit.support_):
    if selected:
        print(name)

Num Features: 3
Selected Features: [ True False False False False  True  True False]
Feature Ranking: [1 2 3 5 6 1 1 4]

Top 3 features:
preg
mass
pedi




In [14]:
# Feature Extraction with PCA
from pandas import read_csv
from sklearn.decomposition import PCA

# Read the dataset, naming the columns explicitly, and split the raw value
# matrix into the 8 input columns and the trailing class column.
filename = 'pima-indians-diabetes.data.csv'
names = [
    'preg', 'plas', 'pres', 'skin',
    'test', 'mass', 'pedi', 'age',
    'class',
]
dataframe = read_csv(filename, names=names)
array = dataframe.values
X, Y = array[:, :8], array[:, 8]

# Fit a 3-component PCA on the inputs only (the class column is not used).
pca = PCA(n_components=3)
fit = pca.fit(X)

# Report the share of variance captured by each component, then the
# component vectors themselves (one row per component).
print("Explained Variance: %s" % (fit.explained_variance_ratio_,))
print(fit.components_)

Explained Variance: [0.889 0.062 0.026]
[[-2.022e-03  9.781e-02  1.609e-02  6.076e-02  9.931e-01  1.401e-02
   5.372e-04 -3.565e-03]
 [-2.265e-02 -9.722e-01 -1.419e-01  5.786e-02  9.463e-02 -4.697e-02
  -8.168e-04 -1.402e-01]
 [-2.246e-02  1.434e-01 -9.225e-01 -3.070e-01  2.098e-02 -1.324e-01
  -6.400e-04 -1.255e-01]]


In [15]:
# Feature Importance with Extra Trees Classifier
from pandas import read_csv
from sklearn.ensemble import ExtraTreesClassifier

# load data (column names are supplied explicitly via `names`)
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]   # first 8 columns: input attributes
Y = array[:,8]     # last column: class label

# feature extraction
# Extra trees are randomized; fixing random_state makes the printed
# importances repeatable across runs (same seed value as the other cells).
# NOTE(review): the number of trees is left at the library default, which
# differs between scikit-learn releases - pin n_estimators if exact numbers matter.
seed = 7
model = ExtraTreesClassifier(random_state=seed)
model.fit(X, Y)
# one importance value per input attribute, in column order
print(model.feature_importances_)

[0.112 0.26  0.099 0.085 0.074 0.137 0.105 0.128]




In [16]:
# Evaluate using a single train/test split
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# load data (column names are supplied explicitly via `names`)
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]   # first 8 columns: input attributes
Y = array[:,8]     # last column: class label

# hold out 33% of the rows for testing; the seed fixes which rows are held out
test_size = 0.33
seed = 7
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

# The solver is pinned so the reported accuracy does not depend on the
# library's default solver, which changed between scikit-learn releases.
model = LogisticRegression(solver='liblinear')
model.fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: %.3f%%" % (result*100.0))

Accuracy: 75.591%




In [17]:
# Evaluate using k-fold Cross Validation
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# load data (column names are supplied explicitly via `names`)
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]   # first 8 columns: input attributes
Y = array[:,8]     # last column: class label

num_folds = 10
# No random_state here: the folds are not shuffled, so a seed has no effect,
# and newer scikit-learn releases raise an error when random_state is given
# without shuffle=True. The consecutive, unshuffled folds are already
# deterministic.
kfold = KFold(n_splits=num_folds)

# The solver is pinned so the reported accuracy does not depend on the
# library's default solver, which changed between scikit-learn releases.
model = LogisticRegression(solver='liblinear')
results = cross_val_score(model, X, Y, cv=kfold)
# mean and standard deviation of the per-fold accuracy
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))

Accuracy: 76.951% (4.841%)




In [18]:
# Evaluate using Leave One Out Cross Validation
from pandas import read_csv
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# load data (column names are supplied explicitly via `names`)
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]   # first 8 columns: input attributes
Y = array[:,8]     # last column: class label

loocv = LeaveOneOut()
# The solver is pinned so the reported accuracy does not depend on the
# library's default solver, which changed between scikit-learn releases.
model = LogisticRegression(solver='liblinear')
results = cross_val_score(model, X, Y, cv=loocv)
# Each fold tests on a single row, so every per-fold score is either 0 or 1;
# the large standard deviation printed below reflects that, not instability.
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))




Accuracy: 76.953% (42.113%)




In [19]:
# Evaluate using Shuffle Split Cross Validation
from pandas import read_csv
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# load data (column names are supplied explicitly via `names`)
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]   # first 8 columns: input attributes
Y = array[:,8]     # last column: class label

# 10 independent random 67%/33% train/test splits; the seed makes them repeatable
n_splits = 10
test_size = 0.33
seed = 7
# NOTE: despite the variable name, this is a ShuffleSplit, not a KFold splitter;
# the name is kept so anything else referring to `kfold` keeps working.
kfold = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=seed)

# The solver is pinned so the reported accuracy does not depend on the
# library's default solver, which changed between scikit-learn releases.
model = LogisticRegression(solver='liblinear')
results = cross_val_score(model, X, Y, cv=kfold)
# mean and standard deviation of the per-split accuracy
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))


Accuracy: 76.496% (1.698%)


