%matplotlib inline
import pandas as pd
pd.set_option('display.max_colwidth', 120)
wine_df_full = pd.read_csv('data/wine_reviews.csv')
# Work on a 10,000-row subset so that the model fitting below is more manageable.
# random_state pins the draw: without it every kernel restart samples different
# rows, so none of the scores shown in this notebook could be reproduced.
wine_df = wine_df_full.sample(n = 10000, random_state = 42)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# Bag-of-words counts feeding a k-nearest-neighbours classifier.
pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('classifier', KNeighborsClassifier()),
])

# Search space: vocabulary size x number of neighbours (3 x 3 = 9 candidates).
parameters = {
    'vectorizer__max_features': [300, 500, 700],
    'classifier__n_neighbors': [2, 3, 5],
}

# 3-fold cross-validated search using every available core. Train scores are
# kept so the train/validation gap can be inspected in cv_results_.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=3,
    n_jobs=-1,
    verbose=2,
    return_train_score=True,
)
grid.fit(wine_df['description'], wine_df['rating'])
Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] classifier__n_neighbors=2, vectorizer__max_features=300 .........
[CV] classifier__n_neighbors=2, vectorizer__max_features=300 .........
[CV] classifier__n_neighbors=2, vectorizer__max_features=300 .........
[CV] classifier__n_neighbors=2, vectorizer__max_features=500 .........
[CV] classifier__n_neighbors=2, vectorizer__max_features=300, total= 4.3s
[CV] classifier__n_neighbors=2, vectorizer__max_features=500 .........
[CV] classifier__n_neighbors=2, vectorizer__max_features=300, total= 4.8s
[CV] classifier__n_neighbors=2, vectorizer__max_features=300, total= 4.6s
[CV] classifier__n_neighbors=2, vectorizer__max_features=500 .........
[CV] classifier__n_neighbors=2, vectorizer__max_features=700 .........
[CV] classifier__n_neighbors=2, vectorizer__max_features=500, total= 4.7s
[CV] classifier__n_neighbors=2, vectorizer__max_features=700 .........
[CV] classifier__n_neighbors=2, vectorizer__max_features=500, total= 3.2s
[CV] classifier__n_neighbors=2, vectorizer__max_features=700 .........
[CV] classifier__n_neighbors=2, vectorizer__max_features=700, total= 3.2s
[CV] classifier__n_neighbors=3, vectorizer__max_features=300 .........
[CV] classifier__n_neighbors=2, vectorizer__max_features=700, total= 3.2s
[CV] classifier__n_neighbors=3, vectorizer__max_features=300 .........
[CV] classifier__n_neighbors=2, vectorizer__max_features=500, total= 3.1s
[CV] classifier__n_neighbors=3, vectorizer__max_features=300 .........
[CV] classifier__n_neighbors=2, vectorizer__max_features=700, total= 3.0s
[CV] classifier__n_neighbors=3, vectorizer__max_features=500 .........
[CV] classifier__n_neighbors=3, vectorizer__max_features=300, total= 3.2s
[CV] classifier__n_neighbors=3, vectorizer__max_features=500 .........
[CV] classifier__n_neighbors=3, vectorizer__max_features=300, total= 3.2s
[CV] classifier__n_neighbors=3, vectorizer__max_features=500 .........
[CV] classifier__n_neighbors=3, vectorizer__max_features=300, total= 3.0s
[CV] classifier__n_neighbors=3, vectorizer__max_features=700 .........
[CV] classifier__n_neighbors=3, vectorizer__max_features=500, total= 3.3s
[CV] classifier__n_neighbors=3, vectorizer__max_features=700 .........
[CV] classifier__n_neighbors=3, vectorizer__max_features=500, total= 3.3s
[CV] classifier__n_neighbors=3, vectorizer__max_features=700 .........
[CV] classifier__n_neighbors=3, vectorizer__max_features=500, total= 3.3s
[CV] classifier__n_neighbors=5, vectorizer__max_features=300 .........
[CV] classifier__n_neighbors=3, vectorizer__max_features=700, total= 3.3s
[CV] classifier__n_neighbors=5, vectorizer__max_features=300 .........
[CV] classifier__n_neighbors=3, vectorizer__max_features=700, total= 3.4s
[CV] classifier__n_neighbors=5, vectorizer__max_features=300 .........
[CV] classifier__n_neighbors=3, vectorizer__max_features=700, total= 3.5s
[CV] classifier__n_neighbors=5, vectorizer__max_features=500 .........
[CV] classifier__n_neighbors=5, vectorizer__max_features=300, total= 3.6s
[CV] classifier__n_neighbors=5, vectorizer__max_features=500 .........
[CV] classifier__n_neighbors=5, vectorizer__max_features=300, total= 3.7s
[CV] classifier__n_neighbors=5, vectorizer__max_features=500 .........
[CV] classifier__n_neighbors=5, vectorizer__max_features=300, total= 3.7s
[CV] classifier__n_neighbors=5, vectorizer__max_features=700 .........
[CV] classifier__n_neighbors=5, vectorizer__max_features=500, total= 3.9s
[CV] classifier__n_neighbors=5, vectorizer__max_features=700 .........
[CV] classifier__n_neighbors=5, vectorizer__max_features=500, total= 3.6s
[CV] classifier__n_neighbors=5, vectorizer__max_features=500, total= 3.5s
[CV] classifier__n_neighbors=5, vectorizer__max_features=700 .........
[CV] classifier__n_neighbors=5, vectorizer__max_features=700, total= 3.4s
[CV] classifier__n_neighbors=5, vectorizer__max_features=700, total= 3.2s
[CV] classifier__n_neighbors=5, vectorizer__max_features=700, total= 2.8s
GridSearchCV(cv=3, error_score='raise',
estimator=Pipeline(memory=None,
steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
lowercase=True, max_df=1.0, max_features=None, min_df=1,
ngram_range=(1, 1), preprocessor=None, stop_words=None,
...owski',
metric_params=None, n_jobs=1, n_neighbors=5, p=2,
weights='uniform'))]),
fit_params=None, iid=True, n_jobs=-1,
param_grid={'vectorizer__max_features': [300, 500, 700], 'classifier__n_neighbors': [2, 3, 5]},
pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
scoring=None, verbose=2)
# Tabulate every candidate from the search, best-ranked test score first.
(
    pd.DataFrame(grid.cv_results_)
    .sort_values(by='rank_test_score')
)
mean_fit_time | mean_score_time | mean_test_score | mean_train_score | param_classifier__n_neighbors | param_vectorizer__max_features | params | rank_test_score | split0_test_score | split0_train_score | split1_test_score | split1_train_score | split2_test_score | split2_train_score | std_fit_time | std_score_time | std_test_score | std_train_score | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.650011 | 3.911422 | 0.7290 | 0.95490 | 2 | 300 | {'classifier__n_neighbors': 2, 'vectorizer__max_features': 300} | 1 | 0.740552 | 0.950945 | 0.727273 | 0.958452 | 0.719172 | 0.955302 | 0.005341 | 0.207406 | 0.008814 | 0.003078 |
1 | 0.663731 | 3.014868 | 0.7212 | 0.96905 | 2 | 500 | {'classifier__n_neighbors': 2, 'vectorizer__max_features': 500} | 2 | 0.724655 | 0.967897 | 0.726373 | 0.971651 | 0.712571 | 0.967602 | 0.024517 | 0.748231 | 0.006141 | 0.001843 |
2 | 0.631577 | 2.498195 | 0.7166 | 0.97260 | 2 | 700 | {'classifier__n_neighbors': 2, 'vectorizer__max_features': 700} | 3 | 0.726155 | 0.971497 | 0.714671 | 0.972551 | 0.708971 | 0.973751 | 0.011422 | 0.080301 | 0.007147 | 0.000921 |
3 | 0.685521 | 2.450972 | 0.7035 | 0.80460 | 3 | 300 | {'classifier__n_neighbors': 3, 'vectorizer__max_features': 300} | 4 | 0.711758 | 0.804530 | 0.705071 | 0.808910 | 0.693669 | 0.800360 | 0.045753 | 0.052470 | 0.007468 | 0.003491 |
6 | 0.684377 | 2.984071 | 0.6984 | 0.75990 | 5 | 300 | {'classifier__n_neighbors': 5, 'vectorizer__max_features': 300} | 5 | 0.707558 | 0.761026 | 0.697570 | 0.764212 | 0.690069 | 0.754462 | 0.029646 | 0.070882 | 0.007164 | 0.004059 |
4 | 0.676721 | 2.596920 | 0.6824 | 0.77825 | 3 | 500 | {'classifier__n_neighbors': 3, 'vectorizer__max_features': 500} | 6 | 0.688062 | 0.780978 | 0.686169 | 0.776361 | 0.672967 | 0.777411 | 0.068777 | 0.043217 | 0.006714 | 0.001976 |
7 | 0.697600 | 2.933428 | 0.6741 | 0.72870 | 5 | 500 | {'classifier__n_neighbors': 5, 'vectorizer__max_features': 500} | 7 | 0.679964 | 0.724572 | 0.678368 | 0.732263 | 0.663966 | 0.729264 | 0.013292 | 0.168813 | 0.007195 | 0.003165 |
5 | 0.721103 | 2.678074 | 0.6727 | 0.76480 | 3 | 700 | {'classifier__n_neighbors': 3, 'vectorizer__max_features': 700} | 8 | 0.675165 | 0.764326 | 0.675968 | 0.769612 | 0.666967 | 0.760462 | 0.071410 | 0.134303 | 0.004067 | 0.003750 |
8 | 0.589188 | 2.520334 | 0.6653 | 0.71335 | 5 | 700 | {'classifier__n_neighbors': 5, 'vectorizer__max_features': 700} | 9 | 0.666167 | 0.711671 | 0.672967 | 0.716514 | 0.656766 | 0.711864 | 0.104428 | 0.161456 | 0.006642 | 0.002239 |
# The winning pipeline from the search (the search ran with refit=True, so this
# estimator is already fitted and ready for predict()).
best_pipeline = grid.best_estimator_
# List every setting of the pipeline, including the nested step parameters.
best_pipeline.get_params(deep=True)
{'classifier': KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=1, n_neighbors=2, p=2,
weights='uniform'),
'classifier__algorithm': 'auto',
'classifier__leaf_size': 30,
'classifier__metric': 'minkowski',
'classifier__metric_params': None,
'classifier__n_jobs': 1,
'classifier__n_neighbors': 2,
'classifier__p': 2,
'classifier__weights': 'uniform',
'memory': None,
'steps': [('vectorizer',
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
lowercase=True, max_df=1.0, max_features=300, min_df=1,
ngram_range=(1, 1), preprocessor=None, stop_words=None,
strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
tokenizer=None, vocabulary=None)),
('classifier',
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=1, n_neighbors=2, p=2,
weights='uniform'))],
'vectorizer': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
lowercase=True, max_df=1.0, max_features=300, min_df=1,
ngram_range=(1, 1), preprocessor=None, stop_words=None,
strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
tokenizer=None, vocabulary=None),
'vectorizer__analyzer': 'word',
'vectorizer__binary': False,
'vectorizer__decode_error': 'strict',
'vectorizer__dtype': numpy.int64,
'vectorizer__encoding': 'utf-8',
'vectorizer__input': 'content',
'vectorizer__lowercase': True,
'vectorizer__max_df': 1.0,
'vectorizer__max_features': 300,
'vectorizer__min_df': 1,
'vectorizer__ngram_range': (1, 1),
'vectorizer__preprocessor': None,
'vectorizer__stop_words': None,
'vectorizer__strip_accents': None,
'vectorizer__token_pattern': '(?u)\\b\\w\\w+\\b',
'vectorizer__tokenizer': None,
'vectorizer__vocabulary': None}
# Score the fitted pipeline on the same rows it was trained on. This is only an
# optimistic sanity check, not an estimate of how well the model generalises.
train_prediction = best_pipeline.predict(wine_df['description'])
# classification_report takes (y_true, y_pred). Passing the predictions first
# swaps precision with recall and reports "support" for the predicted labels
# instead of the true ones.
print(classification_report(wine_df['rating'], train_prediction))
precision recall f1-score support
High 1.00 0.90 0.95 4390
Low 0.93 1.00 0.96 5610
avg / total 0.96 0.96 0.96 10000
# Evaluate on rows the model has never seen. wine_df was sampled from
# wine_df_full and keeps its index labels, so dropping those labels leaves
# exactly the rows that were not used for training. Sampling straight from
# wine_df_full would let training rows leak into the "test" set.
holdout_df = wine_df_full.drop(wine_df.index)
# Cap n at the holdout size so a small dataset cannot make sample() raise;
# random_state keeps the evaluation sample reproducible across runs.
test_sample = holdout_df.sample(n = min(10000, len(holdout_df)), replace=False, random_state = 42)
test_prediction = best_pipeline.predict(test_sample['description'])
# classification_report takes (y_true, y_pred); the true labels go first.
print(classification_report(test_sample['rating'], test_prediction))
precision recall f1-score support
High 0.60 0.77 0.68 3092
Low 0.88 0.77 0.82 6908
avg / total 0.80 0.77 0.78 10000
from sklearn.feature_extraction.text import TfidfTransformer
# Same KNN classifier as before, but the raw counts are re-weighted by a
# TfidfTransformer before they reach the classifier.
pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('classifier', KNeighborsClassifier()),
])

# Search space: 3 vocabulary sizes x 2 stop-word settings x 2 idf settings
# x 2 neighbour counts = 24 candidates.
parameters = {
    'vectorizer__max_features': [250, 300, 350],
    'vectorizer__stop_words': ['english', None],
    'tfidf__use_idf': [True, False],
    'classifier__n_neighbors': [2, 3],
}

# 3-fold cross-validated search using every available core. Train scores are
# kept so the train/validation gap can be inspected in cv_results_.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=3,
    n_jobs=-1,
    verbose=2,
    return_train_score=True,
)
grid.fit(wine_df['description'], wine_df['rating'])
Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=english
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=english
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=english
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=None
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=english, total= 2.3s
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=None
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=english, total= 2.3s
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=english, total= 2.3s
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=None
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=english
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=None, total= 2.8s
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=english
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=english, total= 2.6s
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=english
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=english, total= 2.3s
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=None
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=None, total= 3.2s
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=None
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=None, total= 3.1s
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=None
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=english, total= 2.4s
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=english
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=None, total= 3.0s
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=english
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=None, total= 3.0s
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=english
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=None, total= 3.0s
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=None
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=english, total= 3.7s
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=None
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=english, total= 2.4s
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=None
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=english, total= 2.4s
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=english
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=None, total= 3.0s
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=english
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=None, total= 3.1s
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=english
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=english, total= 2.4s
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=None
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=None, total= 3.0s
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=None
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=english, total= 2.4s
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=None
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=english, total= 2.5s
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=english
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=None, total= 2.8s
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=english
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=None, total= 2.7s
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=english
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=None, total= 2.8s
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=None
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=english, total= 2.5s
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=None
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=english, total= 2.5s
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=None
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=english, total= 2.5s
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=english
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=None, total= 3.0s
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=english
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=None, total= 3.3s
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=english
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=None, total= 3.7s
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=None
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=english, total= 3.2s
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=None
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=english, total= 3.0s
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=None
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=english, total= 3.1s
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=english
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=None, total= 4.1s
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=english
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=None, total= 3.9s
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=english
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=None, total= 4.1s
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=None
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=english, total= 3.8s
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=None
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=english, total= 3.5s
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=None
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=english, total= 3.3s
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=english
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=None, total= 4.5s
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=english
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=None, total= 4.6s
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=english
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=english, total= 4.7s
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=None
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=None, total= 6.5s
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=None
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=english, total= 3.6s
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=None
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=english, total= 3.6s
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=english
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=None, total= 5.0s
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=english
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=None, total= 5.4s
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=english
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=english, total= 4.0s
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=None
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=None, total= 5.0s
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=None
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=english, total= 4.0s
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=None
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=english, total= 4.0s
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=english
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=None, total= 5.2s
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=english
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=None, total= 5.2s
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=english
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=None, total= 5.6s
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=None
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=english, total= 4.6s
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=None
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=english, total= 3.8s
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=None
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=english, total= 3.8s
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=english
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=None, total= 4.4s
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=english
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=None, total= 4.1s
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=english
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=None, total= 4.0s
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=None
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=english, total= 4.3s
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=None
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=english, total= 3.9s
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=None
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=english, total= 4.1s
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=english
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=None, total= 3.6s
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=english
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=None, total= 3.5s
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=english
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=english, total= 3.2s
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=None
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=None, total= 3.9s
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=None
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=english, total= 3.0s
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=None
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=english, total= 3.0s
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=None, total= 3.5s
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=None, total= 3.5s
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=None, total= 3.7s
GridSearchCV(cv=3, error_score='raise',
estimator=Pipeline(memory=None,
steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
lowercase=True, max_df=1.0, max_features=None, min_df=1,
ngram_range=(1, 1), preprocessor=None, stop_words=None,
...owski',
metric_params=None, n_jobs=1, n_neighbors=5, p=2,
weights='uniform'))]),
fit_params=None, iid=True, n_jobs=-1,
param_grid={'vectorizer__max_features': [250, 300, 350], 'vectorizer__stop_words': ['english', None], 'tfidf__use_idf': [True, False], 'classifier__n_neighbors': [2, 3]},
pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
scoring=None, verbose=2)
# Tabulate every candidate from the search, best-ranked test score first.
(
    pd.DataFrame(grid.cv_results_)
    .sort_values(by='rank_test_score')
)
mean_fit_time | mean_score_time | mean_test_score | mean_train_score | param_classifier__n_neighbors | param_tfidf__use_idf | param_vectorizer__max_features | param_vectorizer__stop_words | params | rank_test_score | split0_test_score | split0_train_score | split1_test_score | split1_train_score | split2_test_score | split2_train_score | std_fit_time | std_score_time | std_test_score | std_train_score | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
17 | 1.170683 | 4.161584 | 0.7867 | 0.89755 | 3 | True | 350 | None | {'classifier__n_neighbors': 3, 'tfidf__use_idf': True, 'vectorizer__max_features': 350, 'vectorizer__stop_words': None} | 1 | 0.788542 | 0.899190 | 0.787579 | 0.895155 | 0.783978 | 0.898305 | 0.100817 | 0.107977 | 0.001964 | 0.001732 |
15 | 0.965019 | 4.175408 | 0.7812 | 0.89220 | 3 | True | 300 | None | {'classifier__n_neighbors': 3, 'tfidf__use_idf': True, 'vectorizer__max_features': 300, 'vectorizer__stop_words': None} | 2 | 0.781944 | 0.887339 | 0.783378 | 0.892305 | 0.778278 | 0.896955 | 0.119842 | 0.236494 | 0.002148 | 0.003927 |
16 | 0.928742 | 3.072961 | 0.7773 | 0.88515 | 3 | True | 350 | english | {'classifier__n_neighbors': 3, 'tfidf__use_idf': True, 'vectorizer__max_features': 350, 'vectorizer__stop_words': 'e... | 3 | 0.766647 | 0.890489 | 0.782778 | 0.883156 | 0.782478 | 0.881806 | 0.073366 | 0.057893 | 0.007535 | 0.003815 |
14 | 0.944672 | 3.026088 | 0.7748 | 0.88720 | 3 | True | 300 | english | {'classifier__n_neighbors': 3, 'tfidf__use_idf': True, 'vectorizer__max_features': 300, 'vectorizer__stop_words': 'e... | 4 | 0.763047 | 0.892439 | 0.777678 | 0.885256 | 0.783678 | 0.883906 | 0.016476 | 0.507439 | 0.008665 | 0.003745 |
12 | 0.990014 | 2.508165 | 0.7735 | 0.88150 | 3 | True | 250 | english | {'classifier__n_neighbors': 3, 'tfidf__use_idf': True, 'vectorizer__max_features': 250, 'vectorizer__stop_words': 'e... | 5 | 0.762747 | 0.883438 | 0.775578 | 0.879556 | 0.782178 | 0.881506 | 0.109654 | 0.137139 | 0.008068 | 0.001585 |
13 | 0.830728 | 4.362559 | 0.7721 | 0.88935 | 3 | True | 250 | None | {'classifier__n_neighbors': 3, 'tfidf__use_idf': True, 'vectorizer__max_features': 250, 'vectorizer__stop_words': None} | 6 | 0.775345 | 0.888539 | 0.764776 | 0.887356 | 0.776178 | 0.892155 | 0.091845 | 0.802333 | 0.005189 | 0.002042 |
22 | 0.727000 | 2.340584 | 0.7619 | 0.87875 | 3 | False | 350 | english | {'classifier__n_neighbors': 3, 'tfidf__use_idf': False, 'vectorizer__max_features': 350, 'vectorizer__stop_words': '... | 7 | 0.754649 | 0.883738 | 0.762076 | 0.878656 | 0.768977 | 0.873856 | 0.043127 | 0.072686 | 0.005851 | 0.004035 |
20 | 0.843213 | 3.285683 | 0.7576 | 0.87725 | 3 | False | 300 | english | {'classifier__n_neighbors': 3, 'tfidf__use_idf': False, 'vectorizer__max_features': 300, 'vectorizer__stop_words': '... | 8 | 0.745651 | 0.882238 | 0.755476 | 0.877756 | 0.771677 | 0.871756 | 0.089618 | 0.071229 | 0.010731 | 0.004294 |
4 | 0.578924 | 2.237310 | 0.7527 | 0.90920 | 2 | True | 350 | english | {'classifier__n_neighbors': 2, 'tfidf__use_idf': True, 'vectorizer__max_features': 350, 'vectorizer__stop_words': 'e... | 9 | 0.749550 | 0.911491 | 0.762376 | 0.909255 | 0.746175 | 0.906855 | 0.018350 | 0.628051 | 0.006979 | 0.001893 |
18 | 0.987835 | 3.083754 | 0.7498 | 0.87475 | 3 | False | 250 | english | {'classifier__n_neighbors': 3, 'tfidf__use_idf': False, 'vectorizer__max_features': 250, 'vectorizer__stop_words': '... | 10 | 0.750150 | 0.879238 | 0.743174 | 0.871156 | 0.756076 | 0.873856 | 0.182437 | 0.243110 | 0.005272 | 0.003359 |
2 | 0.580391 | 1.821766 | 0.7479 | 0.90760 | 2 | True | 300 | english | {'classifier__n_neighbors': 2, 'tfidf__use_idf': True, 'vectorizer__max_features': 300, 'vectorizer__stop_words': 'e... | 11 | 0.734553 | 0.908491 | 0.756376 | 0.908355 | 0.752775 | 0.905955 | 0.021492 | 0.085388 | 0.009553 | 0.001165 |
10 | 0.797745 | 2.314199 | 0.7430 | 0.91235 | 2 | False | 350 | english | {'classifier__n_neighbors': 2, 'tfidf__use_idf': False, 'vectorizer__max_features': 350, 'vectorizer__stop_words': '... | 12 | 0.745951 | 0.916442 | 0.739574 | 0.915404 | 0.743474 | 0.905205 | 0.078513 | 0.027637 | 0.002625 | 0.005070 |
8 | 0.582446 | 1.912485 | 0.7391 | 0.90970 | 2 | False | 300 | english | {'classifier__n_neighbors': 2, 'tfidf__use_idf': False, 'vectorizer__max_features': 300, 'vectorizer__stop_words': '... | 13 | 0.737552 | 0.909691 | 0.738674 | 0.914954 | 0.741074 | 0.904455 | 0.025495 | 0.020054 | 0.001469 | 0.004286 |
0 | 0.540779 | 1.746032 | 0.7366 | 0.90365 | 2 | True | 250 | english | {'classifier__n_neighbors': 2, 'tfidf__use_idf': True, 'vectorizer__max_features': 250, 'vectorizer__stop_words': 'e... | 14 | 0.729454 | 0.906691 | 0.738374 | 0.900405 | 0.741974 | 0.903855 | 0.001660 | 0.020781 | 0.005263 | 0.002570 |
5 | 0.599577 | 2.415360 | 0.7362 | 0.88985 | 2 | True | 350 | None | {'classifier__n_neighbors': 2, 'tfidf__use_idf': True, 'vectorizer__max_features': 350, 'vectorizer__stop_words': None} | 15 | 0.739352 | 0.888239 | 0.734473 | 0.889756 | 0.734773 | 0.891555 | 0.015238 | 0.050224 | 0.002233 | 0.001356 |
6 | 0.547162 | 1.846357 | 0.7324 | 0.90300 | 2 | False | 250 | english | {'classifier__n_neighbors': 2, 'tfidf__use_idf': False, 'vectorizer__max_features': 250, 'vectorizer__stop_words': '... | 16 | 0.732454 | 0.902490 | 0.731773 | 0.904455 | 0.732973 | 0.902055 | 0.006475 | 0.046370 | 0.000491 | 0.001044 |
3 | 0.613891 | 2.380259 | 0.7270 | 0.88310 | 2 | True | 300 | None | {'classifier__n_neighbors': 2, 'tfidf__use_idf': True, 'vectorizer__max_features': 300, 'vectorizer__stop_words': None} | 17 | 0.732454 | 0.880288 | 0.725773 | 0.880456 | 0.722772 | 0.888556 | 0.026856 | 0.002337 | 0.004047 | 0.003858 |
23 | 0.752127 | 2.854057 | 0.7186 | 0.86370 | 3 | False | 350 | None | {'classifier__n_neighbors': 3, 'tfidf__use_idf': False, 'vectorizer__max_features': 350, 'vectorizer__stop_words': N... | 18 | 0.723155 | 0.860186 | 0.714071 | 0.863507 | 0.718572 | 0.867407 | 0.028106 | 0.121911 | 0.003709 | 0.002951 |
1 | 0.617535 | 2.401295 | 0.7165 | 0.87620 | 2 | True | 250 | None | {'classifier__n_neighbors': 2, 'tfidf__use_idf': True, 'vectorizer__max_features': 250, 'vectorizer__stop_words': None} | 19 | 0.721356 | 0.879088 | 0.710771 | 0.875056 | 0.717372 | 0.874456 | 0.045237 | 0.139353 | 0.004365 | 0.002057 |
21 | 0.813277 | 2.858795 | 0.7136 | 0.86235 | 3 | False | 300 | None | {'classifier__n_neighbors': 3, 'tfidf__use_idf': False, 'vectorizer__max_features': 300, 'vectorizer__stop_words': N... | 20 | 0.717157 | 0.862886 | 0.711671 | 0.862157 | 0.711971 | 0.862007 | 0.093577 | 0.103089 | 0.002518 | 0.000384 |
19 | 0.916645 | 3.257190 | 0.7118 | 0.85780 | 3 | False | 250 | None | {'classifier__n_neighbors': 3, 'tfidf__use_idf': False, 'vectorizer__max_features': 250, 'vectorizer__stop_words': N... | 21 | 0.712058 | 0.860036 | 0.712271 | 0.857207 | 0.711071 | 0.856157 | 0.111296 | 0.131830 | 0.000523 | 0.001638 |
11 | 0.863508 | 3.169554 | 0.6517 | 0.82095 | 2 | False | 350 | None | {'classifier__n_neighbors': 2, 'tfidf__use_idf': False, 'vectorizer__max_features': 350, 'vectorizer__stop_words': N... | 22 | 0.651170 | 0.817582 | 0.650165 | 0.821209 | 0.653765 | 0.824059 | 0.063974 | 0.086127 | 0.001517 | 0.002651 |
9 | 0.706671 | 2.628934 | 0.6447 | 0.81660 | 2 | False | 300 | None | {'classifier__n_neighbors': 2, 'tfidf__use_idf': False, 'vectorizer__max_features': 300, 'vectorizer__stop_words': N... | 23 | 0.639772 | 0.812031 | 0.641764 | 0.817609 | 0.652565 | 0.820159 | 0.100523 | 0.220924 | 0.005620 | 0.003394 |
7 | 0.590949 | 2.152607 | 0.6382 | 0.80815 | 2 | False | 250 | None | {'classifier__n_neighbors': 2, 'tfidf__use_idf': False, 'vectorizer__max_features': 250, 'vectorizer__stop_words': N... | 24 | 0.633473 | 0.807831 | 0.632763 | 0.807410 | 0.648365 | 0.809210 | 0.007630 | 0.036215 | 0.007193 | 0.000769 |
# Evaluate the best estimator found by the grid search, first on the sample it
# was fitted on and then on reviews drawn from outside that sample.
best_pipeline = grid.best_estimator_

print('Train fit')
train_prediction = best_pipeline.predict(wine_df['description'])
# classification_report expects (y_true, y_pred). Passing them the other way
# round swaps the precision and recall columns and reports the support of the
# predictions instead of the support of the true labels.
print(classification_report(wine_df['rating'], train_prediction))

print('Test fit')
# Leave out the rows used for fitting so the test sample cannot overlap the
# training data (assumes wine_df still carries wine_df_full's index labels --
# TODO confirm the index was not reset after sampling).
held_out = wine_df_full.loc[~wine_df_full.index.isin(wine_df.index)]
# Cap n so sampling without replacement cannot ask for more rows than remain.
test_sample = held_out.sample(n = min(10000, len(held_out)), replace=False)
test_prediction = best_pipeline.predict(test_sample['description'])
print(classification_report(test_sample['rating'], test_prediction))
Train fit
precision recall f1-score support
High 0.84 0.90 0.87 3716
Low 0.94 0.90 0.92 6284
avg / total 0.90 0.90 0.90 10000
Test fit
precision recall f1-score support
High 0.70 0.77 0.74 3602
Low 0.86 0.82 0.84 6398
avg / total 0.81 0.80 0.80 10000
# Overall fraction of test reviews whose predicted rating matches the true one.
# Keyword arguments make the role of each input explicit.
print(accuracy_score(y_true=test_sample['rating'], y_pred=test_prediction))
0.8014