%matplotlib inline

import pandas as pd

pd.set_option('display.max_colwidth', 120)
# 2. Fit the data

vectorizer.fit(wine_df['description'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=0.01,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)
len(vectorizer.get_feature_names())
409
# 3. Transform based on the model
review_word_counts = vectorizer.transform(wine_df['description'])

from sklearn.neighbors import KNeighborsClassifier

# 1. Set the parameters
knn_classifier = KNeighborsClassifier(n_neighbors = 3)
# 2. Fit the data
knn_classifier.fit(review_word_counts, wine_df['rating'])

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')
# 3. Predict based on the model

knn_prediction = knn_classifier.predict(review_word_counts)

print(classification_report(wine_df['rating'], knn_prediction))



             precision    recall  f1-score   support

       High       0.98      0.33      0.50      3880
        Low       0.70      1.00      0.82      6120

avg / total       0.81      0.74      0.70     10000


Your turn

What is the f1 score for the model? </div>

What about the fit on different data?

{:.input_area}
```python
wine_df_test = wine_df_full.sample(n = 10000)
```

{:.input_area}
```python
# Turn the words into numbers (word counts).
# don't rebuild the model, just predict.
wdt_tf = vectorizer.transform(wine_df_test['description'])
```

{:.input_area}
```python
# don't rebuild the model, just predict.
test_prediction = knn_classifier.predict(wdt_tf)
```

{:.input_area}
```python
# Score the test predictions against the *test* labels; the training labels
# in wine_df['rating'] belong to different reviews.
print(classification_report(wine_df_test['rating'], test_prediction))
```

{:.output .output_stream}
```
             precision    recall  f1-score   support

       High       0.40      0.07      0.12      3880
        Low       0.61      0.93      0.74      6120

avg / total       0.53      0.60      0.50     10000
```

*Note: the report shown above was captured when the training labels (`wine_df['rating']`) were passed by mistake; its support counts (3880 / 6120) are those of the training sample. Re-run the cell to see the scores for the test sample.*

![](images/knn2.png)

Your turn

What about changing your model to 6 neighbors? Does it fit better? Do you have the same results as other members of your group? </div> {:.input_area} ```python for n in [2, 4, 6, 12]: print(n) knn_classifier = KNeighborsClassifier(n_neighbors = n) knn_classifier.fit(review_word_counts, wine_df['rating']) train_predict = knn_classifier.predict(review_word_counts) print(accuracy_score(wine_df['rating'], train_predict)) test_predict = knn_classifier.predict(wdt_tf) print(accuracy_score(wine_df_test['rating'], test_predict)) ``` {:.output .output_stream} ``` 2 0.98 0.7381 4 0.7648 0.6853 6 0.7049 0.6616 12 0.6538 0.6343 ``` {:.input_area} ```python from sklearn.model_selection import GridSearchCV ``` {:.input_area} ```python # old model: knn_classifier = KNeighborsClassifier(n_neighbors = 3) parameters = {'n_neighbors' : [2,3, 7], 'weights' : ['distance', 'uniform']} ``` {:.input_area} ```python grid = GridSearchCV(KNeighborsClassifier(), parameters, cv = 5) ``` ![](images/cv.png) {:.input_area} ```python grid.fit(review_word_counts, wine_df['rating']) ``` {:.output .output_data_text} ``` GridSearchCV(cv=5, error_score='raise', estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=5, p=2, weights='uniform'), fit_params=None, iid=True, n_jobs=1, param_grid={'n_neighbors': [2, 3, 7], 'weights': ['distance', 'uniform']}, pre_dispatch='2*n_jobs', refit=True, return_train_score='warn', scoring=None, verbose=0) ``` {:.input_area} ```python grid.cv_results_ ``` {:.output .output_data_text} ``` {'mean_fit_time': array([0.01243081, 0.01146436, 0.01231976, 0.01234374, 0.01134434, 0.01248574]), 'mean_score_time': array([0.6254056 , 0.57316837, 0.72712083, 0.66340117, 0.74647989, 0.76884518]), 'mean_test_score': array([0.6857, 0.7012, 0.6585, 0.6561, 0.6451, 0.6423]), 'mean_train_score': array([1. , 0.979925, 1. , 0.734575, 1. 
, 0.66275 ]), 'param_n_neighbors': masked_array(data=[2, 2, 3, 3, 7, 7], mask=[False, False, False, False, False, False], fill_value='?', dtype=object), 'param_weights': masked_array(data=['distance', 'uniform', 'distance', 'uniform', 'distance', 'uniform'], mask=[False, False, False, False, False, False], fill_value='?', dtype=object), 'params': [{'n_neighbors': 2, 'weights': 'distance'}, {'n_neighbors': 2, 'weights': 'uniform'}, {'n_neighbors': 3, 'weights': 'distance'}, {'n_neighbors': 3, 'weights': 'uniform'}, {'n_neighbors': 7, 'weights': 'distance'}, {'n_neighbors': 7, 'weights': 'uniform'}], 'rank_test_score': array([2, 1, 3, 4, 5, 6], dtype=int32), 'split0_test_score': array([0.6965, 0.706 , 0.656 , 0.654 , 0.645 , 0.6425]), 'split0_train_score': array([1. , 0.978, 1. , 0.737, 1. , 0.665]), 'split1_test_score': array([0.677 , 0.6915, 0.658 , 0.655 , 0.6455, 0.642 ]), 'split1_train_score': array([1. , 0.98175 , 1. , 0.733 , 1. , 0.660875]), 'split2_test_score': array([0.6865, 0.705 , 0.66 , 0.6595, 0.6425, 0.641 ]), 'split2_train_score': array([1. , 0.979375, 1. , 0.73025 , 1. , 0.659875]), 'split3_test_score': array([0.6785, 0.694 , 0.658 , 0.6545, 0.6445, 0.641 ]), 'split3_train_score': array([1. , 0.979875, 1. , 0.7395 , 1. , 0.6685 ]), 'split4_test_score': array([0.69 , 0.7095, 0.6605, 0.6575, 0.648 , 0.645 ]), 'split4_train_score': array([1. , 0.980625, 1. , 0.733125, 1. , 0.6595 ]), 'std_fit_time': array([0.00050485, 0.00067581, 0.00036578, 0.00051288, 0.00045831, 0.00051015]), 'std_score_time': array([0.06634425, 0.01184952, 0.07229286, 0.05769311, 0.01632403, 0.05215541]), 'std_test_score': array([0.00725672, 0.00710352, 0.00161245, 0.00208327, 0.001772 , 0.00146969]), 'std_train_score': array([0. , 0.0012515 , 0. , 0.00326879, 0. , 0.00347761])} ``` {:.input_area} ```python pd.DataFrame(grid.cv_results_) ```

mean_fit_time mean_score_time mean_test_score mean_train_score param_n_neighbors param_weights params rank_test_score split0_test_score split0_train_score ... split2_test_score split2_train_score split3_test_score split3_train_score split4_test_score split4_train_score std_fit_time std_score_time std_test_score std_train_score
0 0.012431 0.625406 0.6857 1.000000 2 distance {'n_neighbors': 2, 'weights': 'distance'} 2 0.6965 1.000 ... 0.6865 1.000000 0.6785 1.000000 0.6900 1.000000 0.000505 0.066344 0.007257 0.000000
1 0.011464 0.573168 0.7012 0.979925 2 uniform {'n_neighbors': 2, 'weights': 'uniform'} 1 0.7060 0.978 ... 0.7050 0.979375 0.6940 0.979875 0.7095 0.980625 0.000676 0.011850 0.007104 0.001251
2 0.012320 0.727121 0.6585 1.000000 3 distance {'n_neighbors': 3, 'weights': 'distance'} 3 0.6560 1.000 ... 0.6600 1.000000 0.6580 1.000000 0.6605 1.000000 0.000366 0.072293 0.001612 0.000000
3 0.012344 0.663401 0.6561 0.734575 3 uniform {'n_neighbors': 3, 'weights': 'uniform'} 4 0.6540 0.737 ... 0.6595 0.730250 0.6545 0.739500 0.6575 0.733125 0.000513 0.057693 0.002083 0.003269
4 0.011344 0.746480 0.6451 1.000000 7 distance {'n_neighbors': 7, 'weights': 'distance'} 5 0.6450 1.000 ... 0.6425 1.000000 0.6445 1.000000 0.6480 1.000000 0.000458 0.016324 0.001772 0.000000
5 0.012486 0.768845 0.6423 0.662750 7 uniform {'n_neighbors': 7, 'weights': 'uniform'} 6 0.6425 0.665 ... 0.6410 0.659875 0.6410 0.668500 0.6450 0.659500 0.000510 0.052155 0.001470 0.003478

6 rows × 22 columns

{:.input_area} ```python grid.best_estimator_ ``` {:.output .output_data_text} ``` KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=2, p=2, weights='uniform') ``` {:.input_area} ```python train_prediction = grid.best_estimator_.predict(review_word_counts) print(accuracy_score(wine_df['rating'], train_prediction)) ``` {:.output .output_stream} ``` 0.98 ``` {:.input_area} ```python knn_best_estimator = grid.best_estimator_ test_prediction = knn_best_estimator.predict(wdt_tf) print(classification_report(wine_df_test['rating'], test_prediction)) ``` {:.output .output_stream} ``` precision recall f1-score support High 0.84 0.42 0.56 3956 Low 0.71 0.95 0.81 6044 avg / total 0.76 0.74 0.71 10000 ```

Your turn

What are the optimal settings for the k-nearest neighbors model? </div> {:.input_area} ```python from sklearn.pipeline import Pipeline from sklearn.model_selection import GridSearchCV pipeline = Pipeline([ ('vectorizer' , CountVectorizer()), ('classifier' , KNeighborsClassifier()) ]) parameters = {'vectorizer__max_features' : [300, 500, 700], 'classifier__n_neighbors' : [2,3, 5] } ``` {:.input_area} ```python grid = GridSearchCV(pipeline, parameters, n_jobs = -1, cv = 3, return_train_score = True, verbose = 1) ``` {:.input_area} ```python grid.fit(wine_df['description'], wine_df['rating']) ``` {:.output .output_stream} ``` Fitting 3 folds for each of 9 candidates, totalling 27 fits ``` {:.output .output_data_text} ``` GridSearchCV(cv=3, error_score='raise', estimator=Pipeline(memory=None, steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict', dtype=<class 'numpy.int64'>, encoding='utf-8', input='content', lowercase=True, max_df=1.0, max_features=None, min_df=1, ngram_range=(1, 1), preprocessor=None, stop_words=None, ...owski', metric_params=None, n_jobs=1, n_neighbors=5, p=2, weights='uniform'))]), fit_params=None, iid=True, n_jobs=-1, param_grid={'vectorizer__max_features': [300, 500, 700], 'classifier__n_neighbors': [2, 3, 5]}, pre_dispatch='2*n_jobs', refit=True, return_train_score=True, scoring=None, verbose=1) ```

Your turn

How does this compare to a logistic regression model? Hint: Google "sklearn logistic regression". </div> {:.input_area} ```python pd.DataFrame(grid.cv_results_) ```

mean_fit_time mean_score_time mean_test_score mean_train_score param_classifier__n_neighbors param_vectorizer__max_features params rank_test_score split0_test_score split0_train_score split1_test_score split1_train_score split2_test_score split2_train_score std_fit_time std_score_time std_test_score std_train_score
0 0.598258 2.685512 0.7243 0.952250 2 300 {'classifier__n_neighbors': 2, 'vectorizer__max_features': 300} 1 0.724355 0.949895 0.726673 0.954402 0.721872 0.952452 0.003394 0.018643 0.001960 0.001846
1 0.582437 2.546346 0.7215 0.965650 2 500 {'classifier__n_neighbors': 2, 'vectorizer__max_features': 500} 3 0.727355 0.959646 0.727573 0.968052 0.709571 0.969252 0.013307 0.115461 0.008435 0.004273
2 0.610783 2.521556 0.7236 0.969000 2 700 {'classifier__n_neighbors': 2, 'vectorizer__max_features': 700} 2 0.728554 0.963546 0.727573 0.969402 0.714671 0.974051 0.025366 0.010965 0.006326 0.004298
3 0.609263 2.527058 0.6987 0.803550 3 300 {'classifier__n_neighbors': 3, 'vectorizer__max_features': 300} 4 0.705459 0.805281 0.700270 0.800060 0.690369 0.805310 0.011740 0.004321 0.006260 0.002468
4 0.632821 2.604387 0.6847 0.782850 3 500 {'classifier__n_neighbors': 3, 'vectorizer__max_features': 500} 6 0.691662 0.787579 0.684668 0.781911 0.677768 0.779061 0.007089 0.036815 0.005672 0.003540
5 0.777215 2.901209 0.6807 0.778551 3 700 {'classifier__n_neighbors': 3, 'vectorizer__max_features': 700} 7 0.686563 0.789379 0.680168 0.775461 0.675368 0.770811 0.160461 0.205566 0.004586 0.007889
6 0.906246 3.479473 0.6947 0.756500 5 300 {'classifier__n_neighbors': 5, 'vectorizer__max_features': 300} 5 0.700960 0.757726 0.693669 0.750112 0.689469 0.761662 0.133283 0.092705 0.004748 0.004794
7 0.759941 3.473811 0.6774 0.736050 5 500 {'classifier__n_neighbors': 5, 'vectorizer__max_features': 500} 8 0.687163 0.744224 0.678068 0.732413 0.666967 0.731513 0.042536 0.118061 0.008259 0.005792
8 0.699978 2.871981 0.6760 0.726950 5 700 {'classifier__n_neighbors': 5, 'vectorizer__max_features': 700} 9 0.682364 0.736124 0.675368 0.719064 0.670267 0.725664 0.017384 0.161824 0.004959 0.007024
And now for something different

Your turn

As a group, take a look at the text of the wine descriptions. Ignore the ratings. What different themes do you find? </div> ![](images/lda.jpg) {:.input_area} ```python wine_df['description'].values[250] ``` {:.output .output_data_text} ``` "There's a strong peach-fuzz aroma enveloping this wine. It's unoaked, tasting overly rich and oily." ``` {:.input_area} ```python from sklearn.decomposition import LatentDirichletAllocation ``` {:.input_area} ```python vectorizer = CountVectorizer(lowercase = True, ngram_range = (1,2), max_df = .50, min_df = .01, max_features = None) ``` {:.input_area} ```python vectorizer.fit(wine_df['description']) ``` {:.output .output_data_text} ``` CountVectorizer(analyzer='word', binary=False, decode_error='strict', dtype=<class 'numpy.int64'>, encoding='utf-8', input='content', lowercase=True, max_df=0.5, max_features=None, min_df=0.01, ngram_range=(1, 2), preprocessor=None, stop_words=None, strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, vocabulary=None) ``` {:.input_area} ```python review_word_counts = vectorizer.transform(wine_df['description']) ``` {:.input_area} ```python lda = LatentDirichletAllocation(n_components = 5) lda.fit(review_word_counts) ``` {:.output .output_data_text} ``` LatentDirichletAllocation(batch_size=128, doc_topic_prior=None, evaluate_every=-1, learning_decay=0.7, learning_method=None, learning_offset=10.0, max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001, n_components=5, n_jobs=1, n_topics=None, perp_tol=0.1, random_state=None, topic_word_prior=None, total_samples=1000000.0, verbose=0) ``` {:.input_area} ```python lda.fit(review_word_counts) ``` {:.output .output_data_text} ``` LatentDirichletAllocation(batch_size=128, doc_topic_prior=None, evaluate_every=-1, learning_decay=0.7, learning_method=None, learning_offset=10.0, max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001, n_components=5, n_jobs=1, n_topics=None, perp_tol=0.1, random_state=None, topic_word_prior=None, 
total_samples=1000000.0, verbose=0) ```

What words are associated with what topics? `LatentDirichletAllocation` is bad at showing results in a pretty way.

{:.input_area}
```python
def column_swap(column):
    '''
    Return the index labels of `column`, ordered from its largest value
    to its smallest.
    '''
    column = column.sort_values(ascending = False)
    return column.index


def topic_words_df(lda_model, vectorizer):
    '''
    Generate dataframe of words associated with a topic model.

    lda_model:  fitted topic model; its `components_` matrix holds one row
                of word scores per topic.
    vectorizer: fitted vectorizer that supplies the vocabulary for those
                scores.

    Returns a dataframe with one column per topic listing the vocabulary
    from highest to lowest score. The index is named 'rank' and starts at 1.
    '''
    # Transpose so that rows are words and columns are topics.
    word_topic_scores = lda_model.components_.T

    # get_feature_names() was removed in scikit-learn 1.2; use its
    # replacement when it exists and fall back on older releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        vocabulary = vectorizer.get_feature_names_out()
    else:
        vocabulary = vectorizer.get_feature_names()

    scores = pd.DataFrame(word_topic_scores, index = vocabulary)

    # Replace each topic's scores with its words in descending score order.
    # (Local is not called `topic_words_df`, to avoid shadowing the function.)
    ranked_words = scores.apply(column_swap).reset_index(drop = True).rename_axis('rank')
    ranked_words.index = ranked_words.index + 1  # ranks start at 1, not 0
    return ranked_words
```

{:.input_area}
```python
topic_words_df(lda, vectorizer).head(10)
```

0 1 2 3 4
rank
1 flavors wine flavors aromas in
2 on to in palate on
3 palate drink wine finish black
4 apple that blend on oak
5 on the from cabernet flavors flavors
6 citrus ripe red aromas of to
7 acidity acidity soft cherry chocolate
8 finish fruit fruit berry wine
9 to has that fruit in the
10 wine tannins blend of on the on the

Your turn

As a group, try different options for your vectorizer and number of topics. What set of parameters creates the most coherent topics? </div>

Your turn

What were the major themes in Donald Trump's campaign speeches? </div>

{:.input_area}
```python
ge_df = pd.read_json('data/ge_speeches.json')
```

{:.input_area}
```python
# Use speech-specific names (ge_vectorizer, ge_lda) so the wine `vectorizer`
# and `lda` fitted earlier are not overwritten; the wine cells further down
# still need them.
ge_vectorizer = CountVectorizer(lowercase = True,
                                ngram_range = (1,1),
                                stop_words = ['lot', 'percent'],
                                max_df = .60,
                                min_df = .01,
                                max_features = None)
ge_vectorizer.fit(ge_df['text'])
```

{:.output .output_data_text}
```
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.6, max_features=None, min_df=0.01,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['lot', 'percent'], strip_accents=None,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, vocabulary=None)
```

{:.input_area}
```python
ge_tf = ge_vectorizer.transform(ge_df['text'])
```

{:.input_area}
```python
ge_lda = LatentDirichletAllocation(n_components = 10)
ge_lda.fit(ge_tf)
topic_words_df(ge_lda, ge_vectorizer).head(10)
```

0 1 2 3 4 5 6 7 8 9
rank
1 illegal him cyber kids isis citizens growth cities folks folks
2 immigration kids review him terrorism administration trillion regulations doesn bad
3 administration friends attacks everybody radical illegal income follow bad isis
4 obamacare each justice kind islamic immigration childcare donors cities doesn
5 folks gwen department something defense high regulations policies everybody pennsylvania
6 mexico black defense college east failed savings rebuild isis illegal
7 border doesn best each iraq 10 wealth policy failed ago
8 cities joyce both friends war kids china citizens didn ok
9 companies kind enforcement still policy best cost interests thing story
10 borders marian crime doesn immigration everybody rate deals companies companies
What documents are associated with what topics? {:.input_area} ```python wine_topics = lda.transform(review_word_counts) ``` {:.input_area} ```python wine_topics ``` {:.output .output_data_text} ``` array([[0.01820025, 0.29652361, 0.11186351, 0.33473465, 0.23867797], [0.0250004 , 0.02517967, 0.02500055, 0.02527369, 0.89954568], [0.05000828, 0.05000008, 0.05000007, 0.79967762, 0.05031396], ..., [0.39021072, 0.03333402, 0.03333387, 0.0339383 , 0.5091831 ], [0.05073342, 0.05000071, 0.05046833, 0.79774203, 0.05105551], [0.04055079, 0.04000018, 0.04000014, 0.83770459, 0.0417443 ]]) ``` {:.input_area} ```python pd.DataFrame(wine_topics).head(10) ```
0 1 2 3 4
0 0.018200 0.296524 0.111864 0.334735 0.238678
1 0.025000 0.025180 0.025001 0.025274 0.899546
2 0.050008 0.050000 0.050000 0.799678 0.050314
3 0.259258 0.020000 0.020168 0.680060 0.020514
4 0.022322 0.133351 0.022223 0.022982 0.799122
5 0.082002 0.012501 0.012516 0.012655 0.880326
6 0.016780 0.016670 0.016672 0.016750 0.933127
7 0.484423 0.025001 0.149840 0.315591 0.025144
8 0.113505 0.346898 0.012549 0.212557 0.314491
9 0.015924 0.015392 0.015385 0.937815 0.015484
We can now use our topics as features {:.input_area} ```python knn_classifier = KNeighborsClassifier(n_neighbors = 3, weights = 'distance') knn_classifier.fit(wine_topics, wine_df['rating']) ``` {:.output .output_data_text} ``` KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=3, p=2, weights='distance') ``` {:.input_area} ```python train_prediction = knn_classifier.predict(wine_topics) ``` {:.input_area} ```python print(accuracy_score(wine_df['rating'], train_prediction)) ``` {:.output .output_stream} ``` 1.0 ``` {:.input_area} ```python test_tf = vectorizer.transform(wine_df_test['description']) test_topics = lda.transform(test_tf) test_prediction = knn_classifier.predict(test_topics) ``` {:.input_area} ```python print(classification_report(wine_df_test['rating'], test_prediction)) ``` {:.output .output_stream} ``` precision recall f1-score support High 0.64 0.59 0.61 3956 Low 0.74 0.78 0.76 6044 avg / total 0.70 0.71 0.70 10000 ```

Your turn

Using your best topic model, what is the prediction rate for your best k nearest neighbors model? </div> Let's do it again, but with a different data set {:.input_area} ```python bg_df = pd.read_csv('data/boardgames.csv') ``` {:.input_area} ```python bg_df.info() ``` {:.output .output_stream} ``` <class 'pandas.core.frame.DataFrame'> RangeIndex: 13720 entries, 0 to 13719 Data columns (total 58 columns): Unnamed: 0 13720 non-null int64 index 13720 non-null int64 name 13720 non-null object description 13720 non-null object max_players 13720 non-null float64 min_players 13720 non-null float64 min_playtime 13720 non-null float64 max_playtime 13720 non-null float64 min_age 13720 non-null float64 category 13720 non-null object mechanics 13720 non-null object year_published 13720 non-null float64 weighted_score 13720 non-null float64 number_of_comments 13720 non-null float64 complexity 13720 non-null float64 number_owned 13720 non-null float64 number_raters 13720 non-null float64 category_cardgame 13720 non-null bool category_wargame 13720 non-null bool category_fantasy 13720 non-null bool category_dice 13720 non-null bool category_partygame 13720 non-null bool category_fighting 13720 non-null bool category_sciencefiction 13720 non-null bool category_abstractstrategy 13720 non-null bool category_economic 13720 non-null bool category_childrensgame 13720 non-null bool category_worldwarii 13720 non-null bool category_bluffing 13720 non-null bool category_animals 13720 non-null bool category_humor 13720 non-null bool category_actiondexterity 13720 non-null bool category_adventure 13720 non-null bool category_moviestvradiotheme 13720 non-null bool category_medieval 13720 non-null bool category_deduction 13720 non-null bool category_miniatures 13720 non-null bool mechanic_dicerolling 13720 non-null bool mechanic_handmanagement 13720 non-null bool mechanic_hexandcounter 13720 non-null bool mechanic_setcollection 13720 non-null bool mechanic_variableplayerpowers 13720 non-null 
bool mechanic_none 13720 non-null bool mechanic_tileplacement 13720 non-null bool mechanic_modularboard 13720 non-null bool mechanic_carddrafting 13720 non-null bool mechanic_rollspinandmove 13720 non-null bool mechanic_areacontrolareainfluence 13720 non-null bool mechanic_auctionbidding 13720 non-null bool mechanic_simulation 13720 non-null bool mechanic_areamovement 13720 non-null bool mechanic_simultaneousactionselection 13720 non-null bool mechanic_actionpointallowancesystem 13720 non-null bool mechanic_cooperativeplay 13720 non-null bool mechanic_pointtopointmovement 13720 non-null bool mechanic_partnerships 13720 non-null bool mechanic_memory 13720 non-null bool quality_game 13720 non-null bool dtypes: bool(41), float64(11), int64(2), object(4) memory usage: 2.3+ MB ``` {:.input_area} ```python bg_df.head() ```

Your turn

Load up this dataset in your other workbook. Topic model the game descriptions. </div> {:.input_area} ```python from sklearn.feature_extraction.text import CountVectorizer ``` {:.input_area} ```python vectorizer = CountVectorizer(max_df=.6, min_df=.01, stop_words= 'english') ``` {:.input_area} ```python vectorizer.fit(bg_df['description']) ``` {:.output .output_data_text} ``` CountVectorizer(analyzer='word', binary=False, decode_error='strict', dtype=<class 'numpy.int64'>, encoding='utf-8', input='content', lowercase=True, max_df=0.6, max_features=None, min_df=0.01, ngram_range=(1, 1), preprocessor=None, stop_words='english', strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, vocabulary=None) ``` {:.input_area} ```python bg_wf = vectorizer.transform(bg_df['description']) ``` {:.input_area} ```python pd.DataFrame(bg_wf.todense(), columns=vectorizer.get_feature_names()).sum().sort_values().tail() ``` {:.output .output_data_text} ``` points 6660 play 8171 card 11868 cards 18295 player 20632 dtype: int64 ``` {:.input_area} ```python len(vectorizer.get_feature_names()) ``` {:.output .output_data_text} ``` 1350 ``` {:.input_area} ```python from sklearn.decomposition import LatentDirichletAllocation ``` {:.input_area} ```python lda = LatentDirichletAllocation(n_components = 10, n_jobs = -1, learning_method = 'online') ``` {:.input_area} ```python lda.fit(bg_wf) ``` {:.output .output_data_text} ``` LatentDirichletAllocation(batch_size=128, doc_topic_prior=None, evaluate_every=-1, learning_decay=0.7, learning_method='online', learning_offset=10.0, max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001, n_components=10, n_jobs=-1, n_topics=None, perp_tol=0.1, random_state=None, topic_word_prior=None, total_samples=1000000.0, verbose=0) ``` {:.input_area} ```python topics = lda.transform(bg_wf) ``` {:.input_area} ```python topics ``` {:.output .output_data_text} ``` array([[0.03679355, 0.00128221, 0.16932467, ..., 0.00128258, 0.23042156, 0.21902049], 
[0.00166734, 0.04633801, 0.00166712, ..., 0.66111848, 0.00166688, 0.00166742], [0.13538142, 0.00178592, 0.0017863 , ..., 0.2467892 , 0.00178637, 0.00178612], ..., [0.00400069, 0.00400016, 0.13058438, ..., 0.00400077, 0.04660411, 0.00400005], [0.00108719, 0.05541306, 0.06481335, ..., 0.00108721, 0.19867116, 0.00108727], [0.20425653, 0.00175463, 0.38152976, ..., 0.00175504, 0.00175484, 0.043443 ]]) ``` {:.input_area} ```python pd.DataFrame(topics) ```

0 1 2 3 4 5 6 7 8 9
0 0.036794 0.001282 0.169325 0.001282 0.001282 0.338028 0.001282 0.001283 0.230422 0.219020
1 0.001667 0.046338 0.001667 0.184943 0.001667 0.097597 0.001667 0.661118 0.001667 0.001667
2 0.135381 0.001786 0.001786 0.055077 0.077460 0.476361 0.001786 0.246789 0.001786 0.001786
3 0.416140 0.003704 0.003705 0.003704 0.003704 0.274632 0.003704 0.003705 0.066949 0.220052
4 0.001924 0.001923 0.046196 0.001923 0.001923 0.538005 0.001923 0.001923 0.402336 0.001923
5 0.069201 0.003449 0.251228 0.003449 0.003449 0.273391 0.003449 0.003449 0.121061 0.267874
6 0.603030 0.002381 0.080042 0.002381 0.230618 0.072023 0.002381 0.002381 0.002381 0.002381
7 0.003705 0.003705 0.399030 0.065456 0.003705 0.509580 0.003705 0.003706 0.003705 0.003704
8 0.002000 0.002000 0.376853 0.002000 0.002001 0.607145 0.002000 0.002000 0.002000 0.002000
9 0.163626 0.000427 0.027896 0.000427 0.000427 0.368236 0.051747 0.386358 0.000427 0.000427
10 0.001786 0.041278 0.168536 0.001786 0.001786 0.001786 0.001786 0.256963 0.522507 0.001786
11 0.001352 0.001352 0.161444 0.001352 0.001352 0.802978 0.026117 0.001352 0.001352 0.001351
12 0.224039 0.023161 0.000568 0.131756 0.000568 0.312246 0.000568 0.131744 0.174781 0.000568
13 0.231017 0.001191 0.001191 0.001191 0.023955 0.594456 0.001191 0.143427 0.001191 0.001191
14 0.002565 0.002564 0.002565 0.613030 0.002565 0.002565 0.040237 0.002565 0.328778 0.002564
15 0.298661 0.158551 0.001667 0.001667 0.001667 0.425807 0.001667 0.054123 0.001667 0.054522
16 0.366926 0.003572 0.003573 0.003572 0.214681 0.003572 0.305771 0.003572 0.003572 0.091189
17 0.102983 0.000807 0.361239 0.034604 0.117067 0.054262 0.000807 0.290779 0.036645 0.000807
18 0.131924 0.000820 0.601042 0.000820 0.000820 0.110248 0.011614 0.000820 0.000820 0.141073
19 0.260034 0.001961 0.001961 0.001961 0.332711 0.090885 0.171884 0.049259 0.087382 0.001961
20 0.031763 0.095154 0.317997 0.000769 0.000769 0.089795 0.363318 0.000769 0.098896 0.000769
21 0.006669 0.006669 0.006668 0.006667 0.006671 0.006669 0.808175 0.006670 0.138474 0.006668
22 0.192892 0.003572 0.003573 0.003573 0.333492 0.108656 0.003573 0.003572 0.343526 0.003572
23 0.005003 0.005002 0.093796 0.091570 0.241970 0.205297 0.299664 0.047696 0.005002 0.005002
24 0.002000 0.002001 0.127625 0.002000 0.072620 0.364125 0.002001 0.367634 0.057993 0.002001
25 0.004763 0.004762 0.004764 0.004762 0.710556 0.004763 0.111830 0.004763 0.004763 0.144274
26 0.110387 0.003572 0.286325 0.003572 0.003572 0.065960 0.003572 0.049891 0.469576 0.003572
27 0.427984 0.006668 0.006669 0.006667 0.151561 0.006669 0.373775 0.006668 0.006671 0.006667
28 0.277243 0.001755 0.104799 0.250019 0.001755 0.038028 0.321137 0.001755 0.001756 0.001755
29 0.003227 0.003226 0.003227 0.003226 0.088290 0.003226 0.885896 0.003228 0.003227 0.003227
... ... ... ... ... ... ... ... ... ... ...
13690 0.001389 0.147356 0.102580 0.001389 0.156851 0.036481 0.001389 0.185860 0.253403 0.113301
13691 0.333421 0.000827 0.072009 0.000827 0.000827 0.485963 0.000827 0.103647 0.000827 0.000826
13692 0.196621 0.002632 0.169927 0.050161 0.109173 0.002633 0.204476 0.002633 0.259113 0.002632
13693 0.407877 0.001087 0.212051 0.001087 0.001087 0.263763 0.001087 0.109786 0.001087 0.001087
13694 0.001613 0.300385 0.080069 0.179955 0.245212 0.001613 0.001613 0.001613 0.165463 0.022463
13695 0.002439 0.002439 0.182178 0.068554 0.092012 0.002440 0.301052 0.156707 0.189739 0.002439
13696 0.240402 0.003846 0.296007 0.003847 0.003847 0.301290 0.139221 0.003847 0.003847 0.003847
13697 0.114267 0.002128 0.055188 0.002128 0.028766 0.002128 0.002128 0.733954 0.002128 0.057186
13698 0.001695 0.001695 0.318692 0.001695 0.001695 0.667746 0.001695 0.001696 0.001695 0.001696
13699 0.002942 0.002942 0.220434 0.002942 0.273285 0.138406 0.350223 0.002942 0.002942 0.002942
13700 0.002273 0.002273 0.208484 0.028788 0.002273 0.002273 0.291414 0.211473 0.248476 0.002273
13701 0.002084 0.002084 0.309668 0.002083 0.002084 0.002084 0.321541 0.187172 0.169118 0.002084
13702 0.000794 0.000794 0.297327 0.104246 0.000794 0.154357 0.000794 0.418542 0.021558 0.000794
13703 0.002326 0.002326 0.128596 0.051862 0.465273 0.340313 0.002326 0.002326 0.002326 0.002326
13704 0.140096 0.004762 0.488923 0.004762 0.004763 0.126639 0.004763 0.215765 0.004763 0.004763
13705 0.003226 0.003227 0.409906 0.084880 0.003226 0.003227 0.162181 0.287394 0.003226 0.039507
13706 0.001282 0.398421 0.251342 0.154234 0.001282 0.001282 0.057943 0.001283 0.093845 0.039084
13707 0.131354 0.003846 0.003847 0.003846 0.003847 0.003848 0.003847 0.241739 0.599979 0.003847
13708 0.206511 0.071463 0.000550 0.000550 0.073339 0.131220 0.214766 0.300503 0.000550 0.000549
13709 0.607212 0.003449 0.238582 0.003449 0.003450 0.130061 0.003449 0.003449 0.003450 0.003449
13710 0.141811 0.003448 0.747494 0.086553 0.003449 0.003449 0.003449 0.003450 0.003449 0.003449
13711 0.005884 0.005883 0.251928 0.005883 0.005883 0.005884 0.005883 0.544371 0.162518 0.005883
13712 0.002273 0.002273 0.317546 0.002274 0.141929 0.286050 0.157851 0.050934 0.002273 0.036597
13713 0.005557 0.005556 0.590524 0.005557 0.005558 0.242133 0.005557 0.005558 0.005556 0.128443
13714 0.615493 0.088824 0.001111 0.001111 0.001111 0.001112 0.001111 0.287903 0.001112 0.001111
13715 0.921552 0.000807 0.000807 0.000807 0.000806 0.000807 0.000806 0.071996 0.000807 0.000806
13716 0.404989 0.003031 0.003031 0.003030 0.003030 0.003031 0.084891 0.488905 0.003031 0.003030
13717 0.004001 0.004000 0.130584 0.175442 0.004001 0.623367 0.004000 0.004001 0.046604 0.004000
13718 0.001087 0.055413 0.064813 0.001087 0.001087 0.198599 0.477068 0.001087 0.198671 0.001087
13719 0.204257 0.001755 0.381530 0.001755 0.001755 0.360242 0.001755 0.001755 0.001755 0.043443

13720 rows × 10 columns

{:.input_area}
```python
def column_swap(column):
    '''
    Return the index labels of `column`, ordered from its largest value
    to its smallest.
    '''
    column = column.sort_values(ascending = False)
    return column.index


def topic_words_df(lda_model, vectorizer):
    '''
    Generate dataframe of words associated with a topic model.

    lda_model:  fitted topic model; its `components_` matrix holds one row
                of word scores per topic.
    vectorizer: fitted vectorizer that supplies the vocabulary for those
                scores.

    Returns a dataframe with one column per topic listing the vocabulary
    from highest to lowest score. The index is named 'rank' and starts at 1.
    '''
    # Transpose so that rows are words and columns are topics.
    word_topic_scores = lda_model.components_.T

    # get_feature_names() was removed in scikit-learn 1.2; use its
    # replacement when it exists and fall back on older releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        vocabulary = vectorizer.get_feature_names_out()
    else:
        vocabulary = vectorizer.get_feature_names()

    scores = pd.DataFrame(word_topic_scores, index = vocabulary)

    # Replace each topic's scores with its words in descending score order.
    # (Local is not called `topic_words_df`, to avoid shadowing the function.)
    ranked_words = scores.apply(column_swap).reset_index(drop = True).rename_axis('rank')
    ranked_words.index = ranked_words.index + 1  # ranks start at 1, not 0
    return ranked_words
```

{:.input_area}
```python
top_words = topic_words_df(lda, vectorizer)
```

{:.input_area}
```python
top_words.head(10)
```
0 1 2 3 4 5 6 7 8 9
rank
1 player rules time battle war player characters cards games french
2 dice campaign play attack units points character card rules british
3 board counters make victory combat city heroes player new north
4 tiles german win army player building treasure play edition american
5 pieces battle ll fight map end monsters hand play 000
6 tile scale best master forces turn dragon deck set town
7 die scenarios way mission world build adventure points version strength
8 roll map just enemy unit actions dungeon round original south
9 turn army race power movement money magic turn includes save
10 points scenario need battles command new hero played box river
{:.input_area}
```python
def lda_predict(model, tf_matrix):
    prediction = model.transform(tf_matrix)
    return pd.DataFrame(prediction)
```

{:.input_area}
```python
lda_predict(lda, bg_wf)
```

What about a different method?

{:.input_area}
```python
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from IPython.display import Image

dtc = DecisionTreeClassifier(max_depth = 3, # Split the sample only three times.
                             min_samples_leaf = 10) # Make sure each leaf has at least 10 samples.
```

{:.input_area}
```python
x_names = ['max_players', 'min_players', 'min_playtime', 'max_playtime', 'min_age']

dtc.fit(bg_df[x_names], bg_df['quality_game'])
```

{:.output .output_data_text}
```
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
```

{:.input_area}
```python
export_graphviz(dtc, out_file='dtc.dot', feature_names=x_names)
!dot -Tpng dtc.dot -o dtc.png
Image(filename='dtc.png')
```

{:.output .output_png}
![png](../images/analysis/topicmodels_90_0.png)

{:.input_area}
```python
from sklearn.ensemble import RandomForestClassifier
```

{:.input_area}
```python
rf = RandomForestClassifier()
rf
```

{:.output .output_data_text}
```
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
```

{:.input_area}
```python
rf.fit(bg_df[x_names], bg_df['quality_game'])
```

{:.output .output_data_text}
```
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
```

{:.input_area}
```python
imp = pd.DataFrame(rf.feature_importances_, index = x_names)
imp
```
0
max_players 0.241430
min_players 0.108830
min_playtime 0.203080
max_playtime 0.237414
min_age 0.209246
{:.input_area}
```python
categories = ['category_cardgame', 'category_wargame', 'category_fantasy', 'category_dice',
              'category_partygame', 'category_fighting', 'category_sciencefiction',
              'category_abstractstrategy', 'category_economic', 'category_childrensgame',
              'category_worldwarii', 'category_bluffing', 'category_animals', 'category_humor',
              'category_actiondexterity', 'category_adventure', 'category_moviestvradiotheme',
              'category_medieval', 'category_deduction', 'category_miniatures']

mechanics = ['mechanic_dicerolling', 'mechanic_handmanagement', 'mechanic_hexandcounter',
             'mechanic_setcollection', 'mechanic_variableplayerpowers', 'mechanic_none',
             'mechanic_tileplacement', 'mechanic_modularboard', 'mechanic_carddrafting',
             'mechanic_rollspinandmove', 'mechanic_areacontrolareainfluence',
             'mechanic_auctionbidding', 'mechanic_simulation', 'mechanic_areamovement',
             'mechanic_simultaneousactionselection', 'mechanic_actionpointallowancesystem',
             'mechanic_cooperativeplay', 'mechanic_pointtopointmovement',
             'mechanic_partnerships', 'mechanic_memory']
```

{:.input_area}
```python
x_names = ['complexity', 'max_players', 'min_players', 'min_playtime', 'max_playtime', 'min_age']

x_names = x_names + mechanics + categories
```

{:.input_area}
```python
# Google "sklearn random forest"

from sklearn.model_selection import GridSearchCV

param_dist = {"max_features": [4],
              "min_samples_split": [10],
              "class_weight" : ["balanced", None],
              "n_estimators" : [20, 25, 30, 35]}

rfgs = GridSearchCV( RandomForestClassifier(),
                     param_dist,
                     cv = 5,
                     verbose=1 )
```

{:.input_area}
```python
rfgs.fit(bg_df[x_names], bg_df['quality_game'])
```

{:.output .output_stream}
```
Fitting 5 folds for each of 8 candidates, totalling 40 fits
```

{:.output .output_data_text}
```
GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_features': [4], 'min_samples_split': [10], 'class_weight': ['balanced', None], 'n_estimators': [20, 25, 30, 35]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)
```

{:.input_area}
```python
rfgs.best_estimator_
```

{:.output .output_data_text}
```
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=4, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
```

{:.input_area}
```python
rf_best = rfgs.best_estimator_
pd.DataFrame(rf_best.feature_importances_, index = x_names).sort_values(by = 0)
```
0
category_childrensgame 0.003584
category_actiondexterity 0.004004
category_partygame 0.004981
category_worldwarii 0.005507
category_moviestvradiotheme 0.006864
category_abstractstrategy 0.007163
category_humor 0.007265
mechanic_cooperativeplay 0.007869
mechanic_simulation 0.007869
category_miniatures 0.008260
category_dice 0.008619
mechanic_memory 0.008779
category_deduction 0.008781
mechanic_rollspinandmove 0.008850
category_adventure 0.009006
category_animals 0.009293
mechanic_areamovement 0.009405
category_bluffing 0.010340
category_sciencefiction 0.010505
mechanic_actionpointallowancesystem 0.011711
category_medieval 0.011962
category_fighting 0.012082
mechanic_auctionbidding 0.012168
category_fantasy 0.012281
mechanic_none 0.012621
mechanic_pointtopointmovement 0.013348
category_cardgame 0.013492
mechanic_partnerships 0.013829
mechanic_tileplacement 0.013831
mechanic_simultaneousactionselection 0.014041
mechanic_hexandcounter 0.016242
mechanic_modularboard 0.017134
category_wargame 0.017296
mechanic_dicerolling 0.017726
category_economic 0.018247
mechanic_setcollection 0.019286
mechanic_variableplayerpowers 0.024662
mechanic_carddrafting 0.026967
mechanic_areacontrolareainfluence 0.028610
min_players 0.029299
mechanic_handmanagement 0.033756
max_players 0.059831
min_playtime 0.064849
min_age 0.067872
max_playtime 0.077706
complexity 0.192207
{:.input_area}
```python
results = pd.DataFrame(rfgs.cv_results_)
results
```
mean_fit_time mean_score_time mean_test_score mean_train_score param_class_weight param_max_features param_min_samples_split param_n_estimators params rank_test_score ... split2_test_score split2_train_score split3_test_score split3_train_score split4_test_score split4_train_score std_fit_time std_score_time std_test_score std_train_score
0 0.035971 0.003265 0.692347 0.766254 balanced 4 10 3 {'class_weight': 'balanced', 'max_features': 4, 'min_samples_split': 10, 'n_estimators': 3} 12 ... 0.745262 0.749271 0.690849 0.765510 0.597886 0.769518 0.002877 0.000604 0.060190 0.009190
1 0.044662 0.003466 0.698324 0.783473 balanced 4 10 5 {'class_weight': 'balanced', 'max_features': 4, 'min_samples_split': 10, 'n_estimators': 5} 11 ... 0.763120 0.775966 0.705432 0.783912 0.612104 0.773982 0.004239 0.000408 0.051645 0.007626
2 0.083042 0.005658 0.712245 0.789450 balanced 4 10 10 {'class_weight': 'balanced', 'max_features': 4, 'min_samples_split': 10, 'n_estimators': 10} 9 ... 0.770408 0.791181 0.711994 0.788922 0.617572 0.785461 0.001559 0.000153 0.056820 0.003434
3 0.127855 0.008309 0.716399 0.800784 balanced 4 10 15 {'class_weight': 'balanced', 'max_features': 4, 'min_samples_split': 10, 'n_estimators': 15} 7 ... 0.765671 0.794916 0.723296 0.800401 0.622676 0.801221 0.006741 0.000515 0.054566 0.005317
4 0.202073 0.013132 0.711953 0.795536 balanced 4 10 25 {'class_weight': 'balanced', 'max_features': 4, 'min_samples_split': 10, 'n_estimators': 25} 10 ... 0.763848 0.788448 0.720744 0.795117 0.608458 0.791382 0.002880 0.000554 0.056656 0.007897
5 0.426163 0.029572 0.715889 0.800091 balanced 4 10 50 {'class_weight': 'balanced', 'max_features': 4, 'min_samples_split': 10, 'n_estimators': 50} 8 ... 0.777697 0.792912 0.726577 0.804409 0.609552 0.797030 0.012709 0.002753 0.059357 0.008176
6 0.032092 0.002788 0.827551 0.875638 None 4 10 3 {'class_weight': None, 'max_features': 4, 'min_samples_split': 10, 'n_estimators': 3} 6 ... 0.860423 0.872540 0.840685 0.873098 0.741524 0.878564 0.001444 0.000186 0.043707 0.002381
7 0.065479 0.005214 0.828717 0.878280 None 4 10 5 {'class_weight': None, 'max_features': 4, 'min_samples_split': 10, 'n_estimators': 5} 5 ... 0.861152 0.874453 0.841415 0.877471 0.748451 0.882664 0.001335 0.000455 0.040815 0.002629
8 0.176274 0.010163 0.838557 0.881013 None 4 10 10 {'class_weight': None, 'max_features': 4, 'min_samples_split': 10, 'n_estimators': 10} 1 ... 0.863338 0.878007 0.849435 0.879840 0.773970 0.885215 0.056332 0.001918 0.032840 0.002381
9 0.220291 0.013547 0.834913 0.881378 None 4 10 15 {'class_weight': None, 'max_features': 4, 'min_samples_split': 10, 'n_estimators': 15} 4 ... 0.864431 0.878371 0.840321 0.879111 0.764127 0.884941 0.026049 0.001387 0.036385 0.002402
10 0.290428 0.020428 0.836152 0.881651 None 4 10 25 {'class_weight': None, 'max_features': 4, 'min_samples_split': 10, 'n_estimators': 25} 3 ... 0.863703 0.878098 0.845425 0.879384 0.767043 0.885123 0.041821 0.001880 0.035300 0.002584
11 0.487544 0.026016 0.836662 0.882507 None 4 10 50 {'class_weight': None, 'max_features': 4, 'min_samples_split': 10, 'n_estimators': 50} 2 ... 0.865160 0.879464 0.846518 0.880113 0.765950 0.885579 0.074376 0.001378 0.035935 0.002363

12 rows × 24 columns

Your super big challenge

You want to make a quality game. Based on this dataset, what sort of game should you make? Use a random forest model to find the best setup parameters for your game.

Bonus challenge: Use both features in the data set and ones you construct from a topic model!

</div>

{:.input_area}
```python
rf_prediction = rf_best.predict_proba(bg_df[x_names])
```

{:.input_area}
```python
from sklearn.calibration import calibration_curve

def calplot(y_observed, y_predicted):
    rf_y, rf_x = calibration_curve(y_observed, y_predicted[:,1], n_bins=10)
    pd.DataFrame([rf_x , rf_y]).T.plot.scatter(x=0, y=1, figsize = (5,5))
```

{:.input_area}
```python
calplot(bg_df['quality_game'], rf_prediction)
```

{:.input_area}
```python
idf = pd.Series(rf_best.feature_importances_, index = x_names)
idf.sort_values()
```

{:.input_area}
```python
idf.sort_values().plot(kind='barh', )
```

{:.input_area}
```python
bg_df.keys()
```