%matplotlib inline

import pandas as pd

# Widen text columns so the review descriptions are readable when displayed.
pd.set_option('display.max_colwidth', 120)
wine_df_full = pd.read_csv('data/wine_reviews.csv')

# Work on a 10,000-row subset so the grid searches below stay manageable.
# A fixed random_state keeps the subset identical on every
# Restart & Run All, which makes the recorded results reproducible.
wine_df = wine_df_full.sample(n=10000, random_state=42)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# Bag-of-words counts feeding a k-nearest-neighbours classifier.
pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('classifier', KNeighborsClassifier()),
])

# Search space: vocabulary size x neighbourhood size (3 x 3 = 9 candidates).
parameters = dict(
    vectorizer__max_features=[300, 500, 700],
    classifier__n_neighbors=[2, 3, 5],
)

# 3-fold cross-validated search over every combination, using all CPU cores.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=3,
    n_jobs=-1,
    verbose=2,
    return_train_score=True,
)
grid.fit(wine_df['description'], wine_df['rating'])
Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] classifier__n_neighbors=2, vectorizer__max_features=300 .........
[CV] classifier__n_neighbors=2, vectorizer__max_features=300 .........
[CV] classifier__n_neighbors=2, vectorizer__max_features=300 .........
[CV] classifier__n_neighbors=2, vectorizer__max_features=500 .........
[CV]  classifier__n_neighbors=2, vectorizer__max_features=300, total=   4.3s
[CV] classifier__n_neighbors=2, vectorizer__max_features=500 .........
[CV]  classifier__n_neighbors=2, vectorizer__max_features=300, total=   4.8s
[CV]  classifier__n_neighbors=2, vectorizer__max_features=300, total=   4.6s
[CV] classifier__n_neighbors=2, vectorizer__max_features=500 .........
[CV] classifier__n_neighbors=2, vectorizer__max_features=700 .........
[CV]  classifier__n_neighbors=2, vectorizer__max_features=500, total=   4.7s
[CV] classifier__n_neighbors=2, vectorizer__max_features=700 .........
[CV]  classifier__n_neighbors=2, vectorizer__max_features=500, total=   3.2s
[CV] classifier__n_neighbors=2, vectorizer__max_features=700 .........
[CV]  classifier__n_neighbors=2, vectorizer__max_features=700, total=   3.2s
[CV] classifier__n_neighbors=3, vectorizer__max_features=300 .........
[CV]  classifier__n_neighbors=2, vectorizer__max_features=700, total=   3.2s
[CV] classifier__n_neighbors=3, vectorizer__max_features=300 .........
[CV]  classifier__n_neighbors=2, vectorizer__max_features=500, total=   3.1s
[CV] classifier__n_neighbors=3, vectorizer__max_features=300 .........
[CV]  classifier__n_neighbors=2, vectorizer__max_features=700, total=   3.0s
[CV] classifier__n_neighbors=3, vectorizer__max_features=500 .........
[CV]  classifier__n_neighbors=3, vectorizer__max_features=300, total=   3.2s
[CV] classifier__n_neighbors=3, vectorizer__max_features=500 .........
[CV]  classifier__n_neighbors=3, vectorizer__max_features=300, total=   3.2s
[CV] classifier__n_neighbors=3, vectorizer__max_features=500 .........
[CV]  classifier__n_neighbors=3, vectorizer__max_features=300, total=   3.0s
[CV] classifier__n_neighbors=3, vectorizer__max_features=700 .........
[CV]  classifier__n_neighbors=3, vectorizer__max_features=500, total=   3.3s
[CV] classifier__n_neighbors=3, vectorizer__max_features=700 .........
[CV]  classifier__n_neighbors=3, vectorizer__max_features=500, total=   3.3s
[CV] classifier__n_neighbors=3, vectorizer__max_features=700 .........
[CV]  classifier__n_neighbors=3, vectorizer__max_features=500, total=   3.3s
[CV] classifier__n_neighbors=5, vectorizer__max_features=300 .........
[CV]  classifier__n_neighbors=3, vectorizer__max_features=700, total=   3.3s
[CV] classifier__n_neighbors=5, vectorizer__max_features=300 .........
[CV]  classifier__n_neighbors=3, vectorizer__max_features=700, total=   3.4s
[CV] classifier__n_neighbors=5, vectorizer__max_features=300 .........
[CV]  classifier__n_neighbors=3, vectorizer__max_features=700, total=   3.5s
[CV] classifier__n_neighbors=5, vectorizer__max_features=500 .........
[CV]  classifier__n_neighbors=5, vectorizer__max_features=300, total=   3.6s
[CV] classifier__n_neighbors=5, vectorizer__max_features=500 .........
[CV]  classifier__n_neighbors=5, vectorizer__max_features=300, total=   3.7s
[CV] classifier__n_neighbors=5, vectorizer__max_features=500 .........
[CV]  classifier__n_neighbors=5, vectorizer__max_features=300, total=   3.7s
[CV] classifier__n_neighbors=5, vectorizer__max_features=700 .........
[CV]  classifier__n_neighbors=5, vectorizer__max_features=500, total=   3.9s
[CV] classifier__n_neighbors=5, vectorizer__max_features=700 .........
[CV]  classifier__n_neighbors=5, vectorizer__max_features=500, total=   3.6s
[CV]  classifier__n_neighbors=5, vectorizer__max_features=500, total=   3.5s
[CV] classifier__n_neighbors=5, vectorizer__max_features=700 .........
[CV]  classifier__n_neighbors=5, vectorizer__max_features=700, total=   3.4s
[CV]  classifier__n_neighbors=5, vectorizer__max_features=700, total=   3.2s
[CV]  classifier__n_neighbors=5, vectorizer__max_features=700, total=   2.8s

GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
       ...owski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'vectorizer__max_features': [300, 500, 700], 'classifier__n_neighbors': [2, 3, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=2)
# Tabulate every candidate's cross-validation results, best rank (1) first.
(
    pd.DataFrame(grid.cv_results_)
      .sort_values('rank_test_score', ascending=True)
)
mean_fit_time mean_score_time mean_test_score mean_train_score param_classifier__n_neighbors param_vectorizer__max_features params rank_test_score split0_test_score split0_train_score split1_test_score split1_train_score split2_test_score split2_train_score std_fit_time std_score_time std_test_score std_train_score
0 0.650011 3.911422 0.7290 0.95490 2 300 {'classifier__n_neighbors': 2, 'vectorizer__max_features': 300} 1 0.740552 0.950945 0.727273 0.958452 0.719172 0.955302 0.005341 0.207406 0.008814 0.003078
1 0.663731 3.014868 0.7212 0.96905 2 500 {'classifier__n_neighbors': 2, 'vectorizer__max_features': 500} 2 0.724655 0.967897 0.726373 0.971651 0.712571 0.967602 0.024517 0.748231 0.006141 0.001843
2 0.631577 2.498195 0.7166 0.97260 2 700 {'classifier__n_neighbors': 2, 'vectorizer__max_features': 700} 3 0.726155 0.971497 0.714671 0.972551 0.708971 0.973751 0.011422 0.080301 0.007147 0.000921
3 0.685521 2.450972 0.7035 0.80460 3 300 {'classifier__n_neighbors': 3, 'vectorizer__max_features': 300} 4 0.711758 0.804530 0.705071 0.808910 0.693669 0.800360 0.045753 0.052470 0.007468 0.003491
6 0.684377 2.984071 0.6984 0.75990 5 300 {'classifier__n_neighbors': 5, 'vectorizer__max_features': 300} 5 0.707558 0.761026 0.697570 0.764212 0.690069 0.754462 0.029646 0.070882 0.007164 0.004059
4 0.676721 2.596920 0.6824 0.77825 3 500 {'classifier__n_neighbors': 3, 'vectorizer__max_features': 500} 6 0.688062 0.780978 0.686169 0.776361 0.672967 0.777411 0.068777 0.043217 0.006714 0.001976
7 0.697600 2.933428 0.6741 0.72870 5 500 {'classifier__n_neighbors': 5, 'vectorizer__max_features': 500} 7 0.679964 0.724572 0.678368 0.732263 0.663966 0.729264 0.013292 0.168813 0.007195 0.003165
5 0.721103 2.678074 0.6727 0.76480 3 700 {'classifier__n_neighbors': 3, 'vectorizer__max_features': 700} 8 0.675165 0.764326 0.675968 0.769612 0.666967 0.760462 0.071410 0.134303 0.004067 0.003750
8 0.589188 2.520334 0.6653 0.71335 5 700 {'classifier__n_neighbors': 5, 'vectorizer__max_features': 700} 9 0.666167 0.711671 0.672967 0.716514 0.656766 0.711864 0.104428 0.161456 0.006642 0.002239
# The search was run with refit=True, so best_estimator_ is the winning
# pipeline already re-fitted on the data passed to grid.fit.
best_pipeline = grid.best_estimator_

# Inspect the full (nested) parameter set of the selected pipeline.
best_pipeline.get_params(deep=True)
{'classifier': KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
            metric_params=None, n_jobs=1, n_neighbors=2, p=2,
            weights='uniform'),
 'classifier__algorithm': 'auto',
 'classifier__leaf_size': 30,
 'classifier__metric': 'minkowski',
 'classifier__metric_params': None,
 'classifier__n_jobs': 1,
 'classifier__n_neighbors': 2,
 'classifier__p': 2,
 'classifier__weights': 'uniform',
 'memory': None,
 'steps': [('vectorizer',
   CountVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=1.0, max_features=300, min_df=1,
           ngram_range=(1, 1), preprocessor=None, stop_words=None,
           strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
           tokenizer=None, vocabulary=None)),
  ('classifier',
   KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
              metric_params=None, n_jobs=1, n_neighbors=2, p=2,
              weights='uniform'))],
 'vectorizer': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=1.0, max_features=300, min_df=1,
         ngram_range=(1, 1), preprocessor=None, stop_words=None,
         strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None, vocabulary=None),
 'vectorizer__analyzer': 'word',
 'vectorizer__binary': False,
 'vectorizer__decode_error': 'strict',
 'vectorizer__dtype': numpy.int64,
 'vectorizer__encoding': 'utf-8',
 'vectorizer__input': 'content',
 'vectorizer__lowercase': True,
 'vectorizer__max_df': 1.0,
 'vectorizer__max_features': 300,
 'vectorizer__min_df': 1,
 'vectorizer__ngram_range': (1, 1),
 'vectorizer__preprocessor': None,
 'vectorizer__stop_words': None,
 'vectorizer__strip_accents': None,
 'vectorizer__token_pattern': '(?u)\\b\\w\\w+\\b',
 'vectorizer__tokenizer': None,
 'vectorizer__vocabulary': None}
# Score the selected pipeline on the same rows it was fitted on. This is an
# optimistic, training-set figure, useful only as a point of comparison.
train_prediction = best_pipeline.predict(wine_df['description'])

# classification_report expects (y_true, y_pred). Passing the predictions
# first swaps precision with recall and reports support for the predicted
# labels instead of the true ones.
print(classification_report(wine_df['rating'], train_prediction))
             precision    recall  f1-score   support

       High       1.00      0.90      0.95      4390
        Low       0.93      1.00      0.96      5610

avg / total       0.96      0.96      0.96     10000


# Evaluate only on reviews the model has never seen. wine_df was sampled from
# wine_df_full, so its rows must be removed first; otherwise training reviews
# leak into the "test" set and inflate the score.
unseen_reviews = wine_df_full.drop(wine_df.index)

# Fixed random_state so the evaluation sample is reproducible across runs.
test_sample = unseen_reviews.sample(n=10000, replace=False, random_state=0)

test_prediction = best_pipeline.predict(test_sample['description'])

# classification_report expects (y_true, y_pred). Passing the predictions
# first swaps precision with recall and reports support for the predicted
# labels instead of the true ones.
print(classification_report(test_sample['rating'], test_prediction))
             precision    recall  f1-score   support

       High       0.60      0.77      0.68      3092
        Low       0.88      0.77      0.82      6908

avg / total       0.80      0.77      0.78     10000


from sklearn.feature_extraction.text import TfidfTransformer
# Same bag-of-words + KNN pipeline as before, with a TF-IDF re-weighting
# step inserted between the vectorizer and the classifier.
pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('classifier', KNeighborsClassifier()),
])

# Search space: 3 vocabulary sizes x 2 stop-word settings x 2 idf settings
# x 2 neighbourhood sizes = 24 candidates.
parameters = dict(
    vectorizer__max_features=[250, 300, 350],
    vectorizer__stop_words=['english', None],
    tfidf__use_idf=[True, False],
    classifier__n_neighbors=[2, 3],
)

# 3-fold cross-validated search over every combination, using all CPU cores.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=3,
    n_jobs=-1,
    verbose=2,
    return_train_score=True,
)
grid.fit(wine_df['description'], wine_df['rating'])
Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=english 
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=english 
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=english 
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=None 
[CV]  classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=english, total=   2.3s
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=None 
[CV]  classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=english, total=   2.3s
[CV]  classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=english, total=   2.3s
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=None 
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=english 
[CV]  classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=None, total=   2.8s
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=english 
[CV]  classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=english, total=   2.6s
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=english 
[CV]  classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=english, total=   2.3s
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=None 
[CV]  classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=None, total=   3.2s
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=None 
[CV]  classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=None, total=   3.1s
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=None 
[CV]  classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=english, total=   2.4s
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=english 
[CV]  classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=None, total=   3.0s
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=english 
[CV]  classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=None, total=   3.0s
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=english 
[CV]  classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=None, total=   3.0s
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=None 
[CV]  classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=english, total=   3.7s
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=None 
[CV]  classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=english, total=   2.4s
[CV] classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=None 
[CV]  classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=english, total=   2.4s
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=english 
[CV]  classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=None, total=   3.0s
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=english 
[CV]  classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=None, total=   3.1s
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=english 
[CV]  classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=english, total=   2.4s
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=None 
[CV]  classifier__n_neighbors=2, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=None, total=   3.0s
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=None 
[CV]  classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=english, total=   2.4s
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=None 
[CV]  classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=english, total=   2.5s
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=english 
[CV]  classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=None, total=   2.8s
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=english 
[CV]  classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=None, total=   2.7s
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=english 
[CV]  classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=None, total=   2.8s
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=None 
[CV]  classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=english, total=   2.5s
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=None 
[CV]  classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=english, total=   2.5s
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=None 
[CV]  classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=english, total=   2.5s
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=english 
[CV]  classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=None, total=   3.0s
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=english 
[CV]  classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=None, total=   3.3s
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=english 
[CV]  classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=None, total=   3.7s
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=None 
[CV]  classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=english, total=   3.2s
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=None 
[CV]  classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=english, total=   3.0s
[CV] classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=None 
[CV]  classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=english, total=   3.1s
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=english 
[CV]  classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=None, total=   4.1s
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=english 
[CV]  classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=None, total=   3.9s
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=english 
[CV]  classifier__n_neighbors=2, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=None, total=   4.1s
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=None 
[CV]  classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=english, total=   3.8s
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=None 
[CV]  classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=english, total=   3.5s
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=None 
[CV]  classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=english, total=   3.3s
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=english 
[CV]  classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=None, total=   4.5s
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=english 
[CV]  classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=None, total=   4.6s
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=english 
[CV]  classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=english, total=   4.7s
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=None 
[CV]  classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=250, vectorizer__stop_words=None, total=   6.5s
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=None 
[CV]  classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=english, total=   3.6s
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=None 
[CV]  classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=english, total=   3.6s
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=english 
[CV]  classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=None, total=   5.0s
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=english 
[CV]  classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=None, total=   5.4s
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=english 
[CV]  classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=english, total=   4.0s
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=None 
[CV]  classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=300, vectorizer__stop_words=None, total=   5.0s
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=None 
[CV]  classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=english, total=   4.0s
[CV] classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=None 
[CV]  classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=english, total=   4.0s
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=english 
[CV]  classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=None, total=   5.2s
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=english 
[CV]  classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=None, total=   5.2s
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=english 
[CV]  classifier__n_neighbors=3, tfidf__use_idf=True, vectorizer__max_features=350, vectorizer__stop_words=None, total=   5.6s
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=None 
[CV]  classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=english, total=   4.6s
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=None 
[CV]  classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=english, total=   3.8s
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=None 
[CV]  classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=english, total=   3.8s
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=english 
[CV]  classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=None, total=   4.4s
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=english 
[CV]  classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=None, total=   4.1s
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=english 
[CV]  classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=250, vectorizer__stop_words=None, total=   4.0s
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=None 
[CV]  classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=english, total=   4.3s
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=None 
[CV]  classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=english, total=   3.9s
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=None 
[CV]  classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=english, total=   4.1s
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=english 
[CV]  classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=None, total=   3.6s
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=english 
[CV]  classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=None, total=   3.5s
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=english 
[CV]  classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=english, total=   3.2s
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=None 
[CV]  classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=300, vectorizer__stop_words=None, total=   3.9s
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=None 
[CV]  classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=english, total=   3.0s
[CV] classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=None 
[CV]  classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=english, total=   3.0s
[CV]  classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=None, total=   3.5s
[CV]  classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=None, total=   3.5s
[CV]  classifier__n_neighbors=3, tfidf__use_idf=False, vectorizer__max_features=350, vectorizer__stop_words=None, total=   3.7s

GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
       ...owski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'vectorizer__max_features': [250, 300, 350], 'vectorizer__stop_words': ['english', None], 'tfidf__use_idf': [True, False], 'classifier__n_neighbors': [2, 3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=2)
# Tabulate the TF-IDF grid's cross-validation results, best rank (1) first.
(
    pd.DataFrame(grid.cv_results_)
      .sort_values('rank_test_score', ascending=True)
)

mean_fit_time mean_score_time mean_test_score mean_train_score param_classifier__n_neighbors param_tfidf__use_idf param_vectorizer__max_features param_vectorizer__stop_words params rank_test_score split0_test_score split0_train_score split1_test_score split1_train_score split2_test_score split2_train_score std_fit_time std_score_time std_test_score std_train_score
17 1.170683 4.161584 0.7867 0.89755 3 True 350 None {'classifier__n_neighbors': 3, 'tfidf__use_idf': True, 'vectorizer__max_features': 350, 'vectorizer__stop_words': None} 1 0.788542 0.899190 0.787579 0.895155 0.783978 0.898305 0.100817 0.107977 0.001964 0.001732
15 0.965019 4.175408 0.7812 0.89220 3 True 300 None {'classifier__n_neighbors': 3, 'tfidf__use_idf': True, 'vectorizer__max_features': 300, 'vectorizer__stop_words': None} 2 0.781944 0.887339 0.783378 0.892305 0.778278 0.896955 0.119842 0.236494 0.002148 0.003927
16 0.928742 3.072961 0.7773 0.88515 3 True 350 english {'classifier__n_neighbors': 3, 'tfidf__use_idf': True, 'vectorizer__max_features': 350, 'vectorizer__stop_words': 'e... 3 0.766647 0.890489 0.782778 0.883156 0.782478 0.881806 0.073366 0.057893 0.007535 0.003815
14 0.944672 3.026088 0.7748 0.88720 3 True 300 english {'classifier__n_neighbors': 3, 'tfidf__use_idf': True, 'vectorizer__max_features': 300, 'vectorizer__stop_words': 'e... 4 0.763047 0.892439 0.777678 0.885256 0.783678 0.883906 0.016476 0.507439 0.008665 0.003745
12 0.990014 2.508165 0.7735 0.88150 3 True 250 english {'classifier__n_neighbors': 3, 'tfidf__use_idf': True, 'vectorizer__max_features': 250, 'vectorizer__stop_words': 'e... 5 0.762747 0.883438 0.775578 0.879556 0.782178 0.881506 0.109654 0.137139 0.008068 0.001585
13 0.830728 4.362559 0.7721 0.88935 3 True 250 None {'classifier__n_neighbors': 3, 'tfidf__use_idf': True, 'vectorizer__max_features': 250, 'vectorizer__stop_words': None} 6 0.775345 0.888539 0.764776 0.887356 0.776178 0.892155 0.091845 0.802333 0.005189 0.002042
22 0.727000 2.340584 0.7619 0.87875 3 False 350 english {'classifier__n_neighbors': 3, 'tfidf__use_idf': False, 'vectorizer__max_features': 350, 'vectorizer__stop_words': '... 7 0.754649 0.883738 0.762076 0.878656 0.768977 0.873856 0.043127 0.072686 0.005851 0.004035
20 0.843213 3.285683 0.7576 0.87725 3 False 300 english {'classifier__n_neighbors': 3, 'tfidf__use_idf': False, 'vectorizer__max_features': 300, 'vectorizer__stop_words': '... 8 0.745651 0.882238 0.755476 0.877756 0.771677 0.871756 0.089618 0.071229 0.010731 0.004294
4 0.578924 2.237310 0.7527 0.90920 2 True 350 english {'classifier__n_neighbors': 2, 'tfidf__use_idf': True, 'vectorizer__max_features': 350, 'vectorizer__stop_words': 'e... 9 0.749550 0.911491 0.762376 0.909255 0.746175 0.906855 0.018350 0.628051 0.006979 0.001893
18 0.987835 3.083754 0.7498 0.87475 3 False 250 english {'classifier__n_neighbors': 3, 'tfidf__use_idf': False, 'vectorizer__max_features': 250, 'vectorizer__stop_words': '... 10 0.750150 0.879238 0.743174 0.871156 0.756076 0.873856 0.182437 0.243110 0.005272 0.003359
2 0.580391 1.821766 0.7479 0.90760 2 True 300 english {'classifier__n_neighbors': 2, 'tfidf__use_idf': True, 'vectorizer__max_features': 300, 'vectorizer__stop_words': 'e... 11 0.734553 0.908491 0.756376 0.908355 0.752775 0.905955 0.021492 0.085388 0.009553 0.001165
10 0.797745 2.314199 0.7430 0.91235 2 False 350 english {'classifier__n_neighbors': 2, 'tfidf__use_idf': False, 'vectorizer__max_features': 350, 'vectorizer__stop_words': '... 12 0.745951 0.916442 0.739574 0.915404 0.743474 0.905205 0.078513 0.027637 0.002625 0.005070
8 0.582446 1.912485 0.7391 0.90970 2 False 300 english {'classifier__n_neighbors': 2, 'tfidf__use_idf': False, 'vectorizer__max_features': 300, 'vectorizer__stop_words': '... 13 0.737552 0.909691 0.738674 0.914954 0.741074 0.904455 0.025495 0.020054 0.001469 0.004286
0 0.540779 1.746032 0.7366 0.90365 2 True 250 english {'classifier__n_neighbors': 2, 'tfidf__use_idf': True, 'vectorizer__max_features': 250, 'vectorizer__stop_words': 'e... 14 0.729454 0.906691 0.738374 0.900405 0.741974 0.903855 0.001660 0.020781 0.005263 0.002570
5 0.599577 2.415360 0.7362 0.88985 2 True 350 None {'classifier__n_neighbors': 2, 'tfidf__use_idf': True, 'vectorizer__max_features': 350, 'vectorizer__stop_words': None} 15 0.739352 0.888239 0.734473 0.889756 0.734773 0.891555 0.015238 0.050224 0.002233 0.001356
6 0.547162 1.846357 0.7324 0.90300 2 False 250 english {'classifier__n_neighbors': 2, 'tfidf__use_idf': False, 'vectorizer__max_features': 250, 'vectorizer__stop_words': '... 16 0.732454 0.902490 0.731773 0.904455 0.732973 0.902055 0.006475 0.046370 0.000491 0.001044
3 0.613891 2.380259 0.7270 0.88310 2 True 300 None {'classifier__n_neighbors': 2, 'tfidf__use_idf': True, 'vectorizer__max_features': 300, 'vectorizer__stop_words': None} 17 0.732454 0.880288 0.725773 0.880456 0.722772 0.888556 0.026856 0.002337 0.004047 0.003858
23 0.752127 2.854057 0.7186 0.86370 3 False 350 None {'classifier__n_neighbors': 3, 'tfidf__use_idf': False, 'vectorizer__max_features': 350, 'vectorizer__stop_words': N... 18 0.723155 0.860186 0.714071 0.863507 0.718572 0.867407 0.028106 0.121911 0.003709 0.002951
1 0.617535 2.401295 0.7165 0.87620 2 True 250 None {'classifier__n_neighbors': 2, 'tfidf__use_idf': True, 'vectorizer__max_features': 250, 'vectorizer__stop_words': None} 19 0.721356 0.879088 0.710771 0.875056 0.717372 0.874456 0.045237 0.139353 0.004365 0.002057
21 0.813277 2.858795 0.7136 0.86235 3 False 300 None {'classifier__n_neighbors': 3, 'tfidf__use_idf': False, 'vectorizer__max_features': 300, 'vectorizer__stop_words': N... 20 0.717157 0.862886 0.711671 0.862157 0.711971 0.862007 0.093577 0.103089 0.002518 0.000384
19 0.916645 3.257190 0.7118 0.85780 3 False 250 None {'classifier__n_neighbors': 3, 'tfidf__use_idf': False, 'vectorizer__max_features': 250, 'vectorizer__stop_words': N... 21 0.712058 0.860036 0.712271 0.857207 0.711071 0.856157 0.111296 0.131830 0.000523 0.001638
11 0.863508 3.169554 0.6517 0.82095 2 False 350 None {'classifier__n_neighbors': 2, 'tfidf__use_idf': False, 'vectorizer__max_features': 350, 'vectorizer__stop_words': N... 22 0.651170 0.817582 0.650165 0.821209 0.653765 0.824059 0.063974 0.086127 0.001517 0.002651
9 0.706671 2.628934 0.6447 0.81660 2 False 300 None {'classifier__n_neighbors': 2, 'tfidf__use_idf': False, 'vectorizer__max_features': 300, 'vectorizer__stop_words': N... 23 0.639772 0.812031 0.641764 0.817609 0.652565 0.820159 0.100523 0.220924 0.005620 0.003394
7 0.590949 2.152607 0.6382 0.80815 2 False 250 None {'classifier__n_neighbors': 2, 'tfidf__use_idf': False, 'vectorizer__max_features': 250, 'vectorizer__stop_words': N... 24 0.633473 0.807831 0.632763 0.807410 0.648365 0.809210 0.007630 0.036215 0.007193 0.000769
# Evaluate the best pipeline found by the grid search, first on the rows it
# was fitted on and then on rows it has not seen.
best_pipeline = grid.best_estimator_

print('Train fit')
train_prediction = best_pipeline.predict(wine_df['description'])

# classification_report expects (y_true, y_pred). Passing the predictions
# first swaps precision and recall for every class and makes the "support"
# column count predicted labels instead of true labels.
print(classification_report(wine_df['rating'], train_prediction))

print('Test fit')
# Sample the test rows only from reviews that were NOT part of the training
# sample; drawing from the whole of wine_df_full would let training rows leak
# into the test set and inflate the scores.
# NOTE(review): assumes wine_df still carries its original row labels from
# wine_df_full (i.e. nobody reset its index in between) -- confirm.
holdout_df = wine_df_full.drop(index=wine_df.index)
test_sample = holdout_df.sample(n=min(10000, len(holdout_df)), replace=False)

test_prediction = best_pipeline.predict(test_sample['description'])

print(classification_report(test_sample['rating'], test_prediction))
Train fit
             precision    recall  f1-score   support

       High       0.84      0.90      0.87      3716
        Low       0.94      0.90      0.92      6284

avg / total       0.90      0.90      0.90     10000

Test fit
             precision    recall  f1-score   support

       High       0.70      0.77      0.74      3602
        Low       0.86      0.82      0.84      6398

avg / total       0.81      0.80      0.80     10000


# Share of test-sample reviews whose predicted rating equals the recorded one.
# Arguments are given in scikit-learn's conventional (y_true, y_pred) order;
# accuracy is symmetric, so the printed value is unaffected.
test_accuracy = accuracy_score(test_sample['rating'], test_prediction)
print(test_accuracy)
0.8014