%matplotlib inline
import pandas as pd
pd.set_option('display.max_colwidth', 120)
# 2. Fit the data
vectorizer.fit(wine_df['description'])
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
lowercase=True, max_df=0.5, max_features=None, min_df=0.01,
ngram_range=(1, 1), preprocessor=None, stop_words='english',
strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
tokenizer=None, vocabulary=None)
len(vectorizer.get_feature_names())
409
# 3. Transform based on the model
review_word_counts = vectorizer.transform(wine_df['description'])
from sklearn.neighbors import KNeighborsClassifier
# 1. Set the parameters
knn_classifier = KNeighborsClassifier(n_neighbors = 3)
# 2. Fit the data
knn_classifier.fit(review_word_counts, wine_df['rating'])
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=1, n_neighbors=3, p=2,
weights='uniform')
# 3. Predict based on the model
knn_prediction = knn_classifier.predict(review_word_counts)
print(classification_report(wine_df['rating'], knn_prediction))
precision recall f1-score support
High 0.98 0.33 0.50 3880
Low 0.70 1.00 0.82 6120
avg / total 0.81 0.74 0.70 10000
Your turn
What is the f1 score for the model?
</div>

What about predicting on different data?

{:.input_area}
```python
wine_df_test = wine_df_full.sample(n = 10000)
```

{:.input_area}
```python
# words into numbers
# don't rebuild the model, just transform.
wdt_tf = vectorizer.transform(wine_df_test['description'])
```

{:.input_area}
```python
# don't rebuild the model, just predict.
test_prediction = knn_classifier.predict(wdt_tf)
```

{:.input_area}
```python
# Score against the labels of the sampled rows (wine_df_test). The ratings in
# wine_df belong to the training rows and are not aligned with test_prediction.
print(classification_report(wine_df_test['rating'], test_prediction))
```

{:.output .output_stream}
```
precision recall f1-score support
High 0.40 0.07 0.12 3880
Low 0.61 0.93 0.74 6120
avg / total 0.53 0.60 0.50 10000
```

Note: the report above was recorded with `wine_df['rating']` as the reference labels (its support counts match the training report), so re-run the cell to refresh the numbers.
Your turn
What about changing your model to 6 neighbors? Does it fit better? Do you have the same results as other members of your group? </div> {:.input_area} ```python for n in [2, 4, 6, 12]: print(n) knn_classifier = KNeighborsClassifier(n_neighbors = n) knn_classifier.fit(review_word_counts, wine_df['rating']) train_predict = knn_classifier.predict(review_word_counts) print(accuracy_score(wine_df['rating'], train_predict)) test_predict = knn_classifier.predict(wdt_tf) print(accuracy_score(wine_df_test['rating'], test_predict)) ``` {:.output .output_stream} ``` 2 0.98 0.7381 4 0.7648 0.6853 6 0.7049 0.6616 12 0.6538 0.6343 ``` {:.input_area} ```python from sklearn.model_selection import GridSearchCV ``` {:.input_area} ```python # old model: knn_classifier = KNeighborsClassifier(n_neighbors = 3) parameters = {'n_neighbors' : [2,3, 7], 'weights' : ['distance', 'uniform']} ``` {:.input_area} ```python grid = GridSearchCV(KNeighborsClassifier(), parameters, cv = 5) ```  {:.input_area} ```python grid.fit(review_word_counts, wine_df['rating']) ``` {:.output .output_data_text} ``` GridSearchCV(cv=5, error_score='raise', estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=5, p=2, weights='uniform'), fit_params=None, iid=True, n_jobs=1, param_grid={'n_neighbors': [2, 3, 7], 'weights': ['distance', 'uniform']}, pre_dispatch='2*n_jobs', refit=True, return_train_score='warn', scoring=None, verbose=0) ``` {:.input_area} ```python grid.cv_results_ ``` {:.output .output_data_text} ``` {'mean_fit_time': array([0.01243081, 0.01146436, 0.01231976, 0.01234374, 0.01134434, 0.01248574]), 'mean_score_time': array([0.6254056 , 0.57316837, 0.72712083, 0.66340117, 0.74647989, 0.76884518]), 'mean_test_score': array([0.6857, 0.7012, 0.6585, 0.6561, 0.6451, 0.6423]), 'mean_train_score': array([1. , 0.979925, 1. , 0.734575, 1. 
, 0.66275 ]), 'param_n_neighbors': masked_array(data=[2, 2, 3, 3, 7, 7], mask=[False, False, False, False, False, False], fill_value='?', dtype=object), 'param_weights': masked_array(data=['distance', 'uniform', 'distance', 'uniform', 'distance', 'uniform'], mask=[False, False, False, False, False, False], fill_value='?', dtype=object), 'params': [{'n_neighbors': 2, 'weights': 'distance'}, {'n_neighbors': 2, 'weights': 'uniform'}, {'n_neighbors': 3, 'weights': 'distance'}, {'n_neighbors': 3, 'weights': 'uniform'}, {'n_neighbors': 7, 'weights': 'distance'}, {'n_neighbors': 7, 'weights': 'uniform'}], 'rank_test_score': array([2, 1, 3, 4, 5, 6], dtype=int32), 'split0_test_score': array([0.6965, 0.706 , 0.656 , 0.654 , 0.645 , 0.6425]), 'split0_train_score': array([1. , 0.978, 1. , 0.737, 1. , 0.665]), 'split1_test_score': array([0.677 , 0.6915, 0.658 , 0.655 , 0.6455, 0.642 ]), 'split1_train_score': array([1. , 0.98175 , 1. , 0.733 , 1. , 0.660875]), 'split2_test_score': array([0.6865, 0.705 , 0.66 , 0.6595, 0.6425, 0.641 ]), 'split2_train_score': array([1. , 0.979375, 1. , 0.73025 , 1. , 0.659875]), 'split3_test_score': array([0.6785, 0.694 , 0.658 , 0.6545, 0.6445, 0.641 ]), 'split3_train_score': array([1. , 0.979875, 1. , 0.7395 , 1. , 0.6685 ]), 'split4_test_score': array([0.69 , 0.7095, 0.6605, 0.6575, 0.648 , 0.645 ]), 'split4_train_score': array([1. , 0.980625, 1. , 0.733125, 1. , 0.6595 ]), 'std_fit_time': array([0.00050485, 0.00067581, 0.00036578, 0.00051288, 0.00045831, 0.00051015]), 'std_score_time': array([0.06634425, 0.01184952, 0.07229286, 0.05769311, 0.01632403, 0.05215541]), 'std_test_score': array([0.00725672, 0.00710352, 0.00161245, 0.00208327, 0.001772 , 0.00146969]), 'std_train_score': array([0. , 0.0012515 , 0. , 0.00326879, 0. , 0.00347761])} ``` {:.input_area} ```python pd.DataFrame(grid.cv_results_) ```
mean_fit_time | mean_score_time | mean_test_score | mean_train_score | param_n_neighbors | param_weights | params | rank_test_score | split0_test_score | split0_train_score | ... | split2_test_score | split2_train_score | split3_test_score | split3_train_score | split4_test_score | split4_train_score | std_fit_time | std_score_time | std_test_score | std_train_score | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.012431 | 0.625406 | 0.6857 | 1.000000 | 2 | distance | {'n_neighbors': 2, 'weights': 'distance'} | 2 | 0.6965 | 1.000 | ... | 0.6865 | 1.000000 | 0.6785 | 1.000000 | 0.6900 | 1.000000 | 0.000505 | 0.066344 | 0.007257 | 0.000000 |
1 | 0.011464 | 0.573168 | 0.7012 | 0.979925 | 2 | uniform | {'n_neighbors': 2, 'weights': 'uniform'} | 1 | 0.7060 | 0.978 | ... | 0.7050 | 0.979375 | 0.6940 | 0.979875 | 0.7095 | 0.980625 | 0.000676 | 0.011850 | 0.007104 | 0.001251 |
2 | 0.012320 | 0.727121 | 0.6585 | 1.000000 | 3 | distance | {'n_neighbors': 3, 'weights': 'distance'} | 3 | 0.6560 | 1.000 | ... | 0.6600 | 1.000000 | 0.6580 | 1.000000 | 0.6605 | 1.000000 | 0.000366 | 0.072293 | 0.001612 | 0.000000 |
3 | 0.012344 | 0.663401 | 0.6561 | 0.734575 | 3 | uniform | {'n_neighbors': 3, 'weights': 'uniform'} | 4 | 0.6540 | 0.737 | ... | 0.6595 | 0.730250 | 0.6545 | 0.739500 | 0.6575 | 0.733125 | 0.000513 | 0.057693 | 0.002083 | 0.003269 |
4 | 0.011344 | 0.746480 | 0.6451 | 1.000000 | 7 | distance | {'n_neighbors': 7, 'weights': 'distance'} | 5 | 0.6450 | 1.000 | ... | 0.6425 | 1.000000 | 0.6445 | 1.000000 | 0.6480 | 1.000000 | 0.000458 | 0.016324 | 0.001772 | 0.000000 |
5 | 0.012486 | 0.768845 | 0.6423 | 0.662750 | 7 | uniform | {'n_neighbors': 7, 'weights': 'uniform'} | 6 | 0.6425 | 0.665 | ... | 0.6410 | 0.659875 | 0.6410 | 0.668500 | 0.6450 | 0.659500 | 0.000510 | 0.052155 | 0.001470 | 0.003478 |
6 rows × 22 columns
Your turn
What are the optimal settings for the k-nearest neighbors model? </div> {:.input_area} ```python from sklearn.pipeline import Pipeline from sklearn.model_selection import GridSearchCV pipeline = Pipeline([ ('vectorizer' , CountVectorizer()), ('classifier' , KNeighborsClassifier()) ]) parameters = {'vectorizer__max_features' : [300, 500, 700], 'classifier__n_neighbors' : [2,3, 5] } ``` {:.input_area} ```python grid = GridSearchCV(pipeline, parameters, n_jobs = -1, cv = 3, return_train_score = True, verbose = 1) ``` {:.input_area} ```python grid.fit(wine_df['description'], wine_df['rating']) ``` {:.output .output_stream} ``` Fitting 3 folds for each of 9 candidates, totalling 27 fits ``` {:.output .output_data_text} ``` GridSearchCV(cv=3, error_score='raise', estimator=Pipeline(memory=None, steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict', dtype=<class 'numpy.int64'>, encoding='utf-8', input='content', lowercase=True, max_df=1.0, max_features=None, min_df=1, ngram_range=(1, 1), preprocessor=None, stop_words=None, ...owski', metric_params=None, n_jobs=1, n_neighbors=5, p=2, weights='uniform'))]), fit_params=None, iid=True, n_jobs=-1, param_grid={'vectorizer__max_features': [300, 500, 700], 'classifier__n_neighbors': [2, 3, 5]}, pre_dispatch='2*n_jobs', refit=True, return_train_score=True, scoring=None, verbose=1) ```
Your turn
How does this compare to a logistic regression model?
Hint: search the web for "sklearn logistic regression".
</div>
{:.input_area}
```python
pd.DataFrame(grid.cv_results_)
```
mean_fit_time | mean_score_time | mean_test_score | mean_train_score | param_classifier__n_neighbors | param_vectorizer__max_features | params | rank_test_score | split0_test_score | split0_train_score | split1_test_score | split1_train_score | split2_test_score | split2_train_score | std_fit_time | std_score_time | std_test_score | std_train_score | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.598258 | 2.685512 | 0.7243 | 0.952250 | 2 | 300 | {'classifier__n_neighbors': 2, 'vectorizer__max_features': 300} | 1 | 0.724355 | 0.949895 | 0.726673 | 0.954402 | 0.721872 | 0.952452 | 0.003394 | 0.018643 | 0.001960 | 0.001846 |
1 | 0.582437 | 2.546346 | 0.7215 | 0.965650 | 2 | 500 | {'classifier__n_neighbors': 2, 'vectorizer__max_features': 500} | 3 | 0.727355 | 0.959646 | 0.727573 | 0.968052 | 0.709571 | 0.969252 | 0.013307 | 0.115461 | 0.008435 | 0.004273 |
2 | 0.610783 | 2.521556 | 0.7236 | 0.969000 | 2 | 700 | {'classifier__n_neighbors': 2, 'vectorizer__max_features': 700} | 2 | 0.728554 | 0.963546 | 0.727573 | 0.969402 | 0.714671 | 0.974051 | 0.025366 | 0.010965 | 0.006326 | 0.004298 |
3 | 0.609263 | 2.527058 | 0.6987 | 0.803550 | 3 | 300 | {'classifier__n_neighbors': 3, 'vectorizer__max_features': 300} | 4 | 0.705459 | 0.805281 | 0.700270 | 0.800060 | 0.690369 | 0.805310 | 0.011740 | 0.004321 | 0.006260 | 0.002468 |
4 | 0.632821 | 2.604387 | 0.6847 | 0.782850 | 3 | 500 | {'classifier__n_neighbors': 3, 'vectorizer__max_features': 500} | 6 | 0.691662 | 0.787579 | 0.684668 | 0.781911 | 0.677768 | 0.779061 | 0.007089 | 0.036815 | 0.005672 | 0.003540 |
5 | 0.777215 | 2.901209 | 0.6807 | 0.778551 | 3 | 700 | {'classifier__n_neighbors': 3, 'vectorizer__max_features': 700} | 7 | 0.686563 | 0.789379 | 0.680168 | 0.775461 | 0.675368 | 0.770811 | 0.160461 | 0.205566 | 0.004586 | 0.007889 |
6 | 0.906246 | 3.479473 | 0.6947 | 0.756500 | 5 | 300 | {'classifier__n_neighbors': 5, 'vectorizer__max_features': 300} | 5 | 0.700960 | 0.757726 | 0.693669 | 0.750112 | 0.689469 | 0.761662 | 0.133283 | 0.092705 | 0.004748 | 0.004794 |
7 | 0.759941 | 3.473811 | 0.6774 | 0.736050 | 5 | 500 | {'classifier__n_neighbors': 5, 'vectorizer__max_features': 500} | 8 | 0.687163 | 0.744224 | 0.678068 | 0.732413 | 0.666967 | 0.731513 | 0.042536 | 0.118061 | 0.008259 | 0.005792 |
8 | 0.699978 | 2.871981 | 0.6760 | 0.726950 | 5 | 700 | {'classifier__n_neighbors': 5, 'vectorizer__max_features': 700} | 9 | 0.682364 | 0.736124 | 0.675368 | 0.719064 | 0.670267 | 0.725664 | 0.017384 | 0.161824 | 0.004959 | 0.007024 |
Your turn
As a group, take a look at the text of the wine descriptions. Ignore the ratings. What different themes do you find? </div>  {:.input_area} ```python wine_df['description'].values[250] ``` {:.output .output_data_text} ``` "There's a strong peach-fuzz aroma enveloping this wine. It's unoaked, tasting overly rich and oily." ``` {:.input_area} ```python from sklearn.decomposition import LatentDirichletAllocation ``` {:.input_area} ```python vectorizer = CountVectorizer(lowercase = True, ngram_range = (1,2), max_df = .50, min_df = .01, max_features = None) ``` {:.input_area} ```python vectorizer.fit(wine_df['description']) ``` {:.output .output_data_text} ``` CountVectorizer(analyzer='word', binary=False, decode_error='strict', dtype=<class 'numpy.int64'>, encoding='utf-8', input='content', lowercase=True, max_df=0.5, max_features=None, min_df=0.01, ngram_range=(1, 2), preprocessor=None, stop_words=None, strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, vocabulary=None) ``` {:.input_area} ```python review_word_counts = vectorizer.transform(wine_df['description']) ``` {:.input_area} ```python lda = LatentDirichletAllocation(n_components = 5) lda.fit(review_word_counts) ``` {:.output .output_data_text} ``` LatentDirichletAllocation(batch_size=128, doc_topic_prior=None, evaluate_every=-1, learning_decay=0.7, learning_method=None, learning_offset=10.0, max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001, n_components=5, n_jobs=1, n_topics=None, perp_tol=0.1, random_state=None, topic_word_prior=None, total_samples=1000000.0, verbose=0) ``` {:.input_area} ```python lda.fit(review_word_counts) ``` {:.output .output_data_text} ``` LatentDirichletAllocation(batch_size=128, doc_topic_prior=None, evaluate_every=-1, learning_decay=0.7, learning_method=None, learning_offset=10.0, max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001, n_components=5, n_jobs=1, n_topics=None, perp_tol=0.1, random_state=None, topic_word_prior=None, 
total_samples=1000000.0, verbose=0) ```

What words are associated with what topics? LatentDirichletAllocation is bad at showing results in a pretty way.

{:.input_area}
```python
def column_swap(column):
    '''
    Return the index labels of a column, ordered from its largest value
    to its smallest.
    '''
    return column.sort_values(ascending = False).index


def topic_words_df(lda_model, vectorizer):
    '''
    Generate dataframe of words associated with a topic model.

    lda_model: fitted topic model; its `components_` array is transposed
        here so that each row is a word and each column is a topic.
    vectorizer: fitted vectorizer whose feature names label those rows.

    Returns a dataframe with one column per topic and an index named
    `rank` that starts at 1; each cell holds the word with that rank
    in that topic.
    '''
    word_topic_scores = lda_model.components_.T

    # Recent scikit-learn releases replaced get_feature_names() with
    # get_feature_names_out(); use whichever this installation provides.
    if hasattr(vectorizer, 'get_feature_names_out'):
        vocabulary = vectorizer.get_feature_names_out()
    else:
        vocabulary = vectorizer.get_feature_names()

    scores = pd.DataFrame(word_topic_scores, index = vocabulary)

    # Replace every topic column by its words sorted by score, then number
    # the rows from 1 so the index reads as a rank.
    ranked_words = scores.apply(column_swap).reset_index(drop = True).rename_axis('rank')
    ranked_words.index = ranked_words.index + 1

    return ranked_words
```

{:.input_area}
```python
topic_words_df(lda, vectorizer).head(10)
```
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
rank | |||||
1 | flavors | wine | flavors | aromas | in |
2 | on | to | in | palate | on |
3 | palate | drink | wine | finish | black |
4 | apple | that | blend | on | oak |
5 | on the | from | cabernet | flavors | flavors |
6 | citrus | ripe | red | aromas of | to |
7 | acidity | acidity | soft | cherry | chocolate |
8 | finish | fruit | fruit | berry | wine |
9 | to | has | that | fruit | in the |
10 | wine | tannins | blend of | on the | on the |
Your turn
As a group, try different options for your vectorizer and number of topics. What set of parameters creates the most coherent topics? </div>
Your turn
What were the major themes in Donald Trump campaign speeches? </div> {:.input_area} ```python ge_df = pd.read_json('data/ge_speeches.json') ``` {:.input_area} ```python vectorizer = CountVectorizer(lowercase = True, ngram_range = (1,1), stop_words = ['lot', 'percent'], max_df = .60, min_df = .01, max_features = None) vectorizer.fit(ge_df['text']) ``` {:.output .output_data_text} ``` CountVectorizer(analyzer='word', binary=False, decode_error='strict', dtype=<class 'numpy.int64'>, encoding='utf-8', input='content', lowercase=True, max_df=0.6, max_features=None, min_df=0.01, ngram_range=(1, 1), preprocessor=None, stop_words=['lot', 'percent'], strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, vocabulary=None) ``` {:.input_area} ```python ge_tf = vectorizer.transform(ge_df['text']) ``` {:.input_area} ```python lda = LatentDirichletAllocation(n_components = 10) lda.fit(ge_tf) topic_words_df(lda, vectorizer).head(10) ```
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
---|---|---|---|---|---|---|---|---|---|---|
rank | ||||||||||
1 | illegal | him | cyber | kids | isis | citizens | growth | cities | folks | folks |
2 | immigration | kids | review | him | terrorism | administration | trillion | regulations | doesn | bad |
3 | administration | friends | attacks | everybody | radical | illegal | income | follow | bad | isis |
4 | obamacare | each | justice | kind | islamic | immigration | childcare | donors | cities | doesn |
5 | folks | gwen | department | something | defense | high | regulations | policies | everybody | pennsylvania |
6 | mexico | black | defense | college | east | failed | savings | rebuild | isis | illegal |
7 | border | doesn | best | each | iraq | 10 | wealth | policy | failed | ago |
8 | cities | joyce | both | friends | war | kids | china | citizens | didn | ok |
9 | companies | kind | enforcement | still | policy | best | cost | interests | thing | story |
10 | borders | marian | crime | doesn | immigration | everybody | rate | deals | companies | companies |
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
0 | 0.018200 | 0.296524 | 0.111864 | 0.334735 | 0.238678 |
1 | 0.025000 | 0.025180 | 0.025001 | 0.025274 | 0.899546 |
2 | 0.050008 | 0.050000 | 0.050000 | 0.799678 | 0.050314 |
3 | 0.259258 | 0.020000 | 0.020168 | 0.680060 | 0.020514 |
4 | 0.022322 | 0.133351 | 0.022223 | 0.022982 | 0.799122 |
5 | 0.082002 | 0.012501 | 0.012516 | 0.012655 | 0.880326 |
6 | 0.016780 | 0.016670 | 0.016672 | 0.016750 | 0.933127 |
7 | 0.484423 | 0.025001 | 0.149840 | 0.315591 | 0.025144 |
8 | 0.113505 | 0.346898 | 0.012549 | 0.212557 | 0.314491 |
9 | 0.015924 | 0.015392 | 0.015385 | 0.937815 | 0.015484 |
Your turn
Using your best topic model, what is the prediction rate for your best k nearest neighbors model? </div> Let's do it again, but with a different data set {:.input_area} ```python bg_df = pd.read_csv('data/boardgames.csv') ``` {:.input_area} ```python bg_df.info() ``` {:.output .output_stream} ``` <class 'pandas.core.frame.DataFrame'> RangeIndex: 13720 entries, 0 to 13719 Data columns (total 58 columns): Unnamed: 0 13720 non-null int64 index 13720 non-null int64 name 13720 non-null object description 13720 non-null object max_players 13720 non-null float64 min_players 13720 non-null float64 min_playtime 13720 non-null float64 max_playtime 13720 non-null float64 min_age 13720 non-null float64 category 13720 non-null object mechanics 13720 non-null object year_published 13720 non-null float64 weighted_score 13720 non-null float64 number_of_comments 13720 non-null float64 complexity 13720 non-null float64 number_owned 13720 non-null float64 number_raters 13720 non-null float64 category_cardgame 13720 non-null bool category_wargame 13720 non-null bool category_fantasy 13720 non-null bool category_dice 13720 non-null bool category_partygame 13720 non-null bool category_fighting 13720 non-null bool category_sciencefiction 13720 non-null bool category_abstractstrategy 13720 non-null bool category_economic 13720 non-null bool category_childrensgame 13720 non-null bool category_worldwarii 13720 non-null bool category_bluffing 13720 non-null bool category_animals 13720 non-null bool category_humor 13720 non-null bool category_actiondexterity 13720 non-null bool category_adventure 13720 non-null bool category_moviestvradiotheme 13720 non-null bool category_medieval 13720 non-null bool category_deduction 13720 non-null bool category_miniatures 13720 non-null bool mechanic_dicerolling 13720 non-null bool mechanic_handmanagement 13720 non-null bool mechanic_hexandcounter 13720 non-null bool mechanic_setcollection 13720 non-null bool mechanic_variableplayerpowers 13720 non-null 
bool mechanic_none 13720 non-null bool mechanic_tileplacement 13720 non-null bool mechanic_modularboard 13720 non-null bool mechanic_carddrafting 13720 non-null bool mechanic_rollspinandmove 13720 non-null bool mechanic_areacontrolareainfluence 13720 non-null bool mechanic_auctionbidding 13720 non-null bool mechanic_simulation 13720 non-null bool mechanic_areamovement 13720 non-null bool mechanic_simultaneousactionselection 13720 non-null bool mechanic_actionpointallowancesystem 13720 non-null bool mechanic_cooperativeplay 13720 non-null bool mechanic_pointtopointmovement 13720 non-null bool mechanic_partnerships 13720 non-null bool mechanic_memory 13720 non-null bool quality_game 13720 non-null bool dtypes: bool(41), float64(11), int64(2), object(4) memory usage: 2.3+ MB ``` {:.input_area} ```python bg_df.head() ```
Your turn
Load up this dataset in your other workbook. Topic model the game descriptions. </div> {:.input_area} ```python from sklearn.feature_extraction.text import CountVectorizer ``` {:.input_area} ```python vectorizer = CountVectorizer(max_df=.6, min_df=.01, stop_words= 'english') ``` {:.input_area} ```python vectorizer.fit(bg_df['description']) ``` {:.output .output_data_text} ``` CountVectorizer(analyzer='word', binary=False, decode_error='strict', dtype=<class 'numpy.int64'>, encoding='utf-8', input='content', lowercase=True, max_df=0.6, max_features=None, min_df=0.01, ngram_range=(1, 1), preprocessor=None, stop_words='english', strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, vocabulary=None) ``` {:.input_area} ```python bg_wf = vectorizer.transform(bg_df['description']) ``` {:.input_area} ```python pd.DataFrame(bg_wf.todense(), columns=vectorizer.get_feature_names()).sum().sort_values().tail() ``` {:.output .output_data_text} ``` points 6660 play 8171 card 11868 cards 18295 player 20632 dtype: int64 ``` {:.input_area} ```python len(vectorizer.get_feature_names()) ``` {:.output .output_data_text} ``` 1350 ``` {:.input_area} ```python from sklearn.decomposition import LatentDirichletAllocation ``` {:.input_area} ```python lda = LatentDirichletAllocation(n_components = 10, n_jobs = -1, learning_method = 'online') ``` {:.input_area} ```python lda.fit(bg_wf) ``` {:.output .output_data_text} ``` LatentDirichletAllocation(batch_size=128, doc_topic_prior=None, evaluate_every=-1, learning_decay=0.7, learning_method='online', learning_offset=10.0, max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001, n_components=10, n_jobs=-1, n_topics=None, perp_tol=0.1, random_state=None, topic_word_prior=None, total_samples=1000000.0, verbose=0) ``` {:.input_area} ```python topics = lda.transform(bg_wf) ``` {:.input_area} ```python topics ``` {:.output .output_data_text} ``` array([[0.03679355, 0.00128221, 0.16932467, ..., 0.00128258, 0.23042156, 0.21902049], 
[0.00166734, 0.04633801, 0.00166712, ..., 0.66111848, 0.00166688, 0.00166742], [0.13538142, 0.00178592, 0.0017863 , ..., 0.2467892 , 0.00178637, 0.00178612], ..., [0.00400069, 0.00400016, 0.13058438, ..., 0.00400077, 0.04660411, 0.00400005], [0.00108719, 0.05541306, 0.06481335, ..., 0.00108721, 0.19867116, 0.00108727], [0.20425653, 0.00175463, 0.38152976, ..., 0.00175504, 0.00175484, 0.043443 ]]) ``` {:.input_area} ```python pd.DataFrame(topics) ```
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0.036794 | 0.001282 | 0.169325 | 0.001282 | 0.001282 | 0.338028 | 0.001282 | 0.001283 | 0.230422 | 0.219020 |
1 | 0.001667 | 0.046338 | 0.001667 | 0.184943 | 0.001667 | 0.097597 | 0.001667 | 0.661118 | 0.001667 | 0.001667 |
2 | 0.135381 | 0.001786 | 0.001786 | 0.055077 | 0.077460 | 0.476361 | 0.001786 | 0.246789 | 0.001786 | 0.001786 |
3 | 0.416140 | 0.003704 | 0.003705 | 0.003704 | 0.003704 | 0.274632 | 0.003704 | 0.003705 | 0.066949 | 0.220052 |
4 | 0.001924 | 0.001923 | 0.046196 | 0.001923 | 0.001923 | 0.538005 | 0.001923 | 0.001923 | 0.402336 | 0.001923 |
5 | 0.069201 | 0.003449 | 0.251228 | 0.003449 | 0.003449 | 0.273391 | 0.003449 | 0.003449 | 0.121061 | 0.267874 |
6 | 0.603030 | 0.002381 | 0.080042 | 0.002381 | 0.230618 | 0.072023 | 0.002381 | 0.002381 | 0.002381 | 0.002381 |
7 | 0.003705 | 0.003705 | 0.399030 | 0.065456 | 0.003705 | 0.509580 | 0.003705 | 0.003706 | 0.003705 | 0.003704 |
8 | 0.002000 | 0.002000 | 0.376853 | 0.002000 | 0.002001 | 0.607145 | 0.002000 | 0.002000 | 0.002000 | 0.002000 |
9 | 0.163626 | 0.000427 | 0.027896 | 0.000427 | 0.000427 | 0.368236 | 0.051747 | 0.386358 | 0.000427 | 0.000427 |
10 | 0.001786 | 0.041278 | 0.168536 | 0.001786 | 0.001786 | 0.001786 | 0.001786 | 0.256963 | 0.522507 | 0.001786 |
11 | 0.001352 | 0.001352 | 0.161444 | 0.001352 | 0.001352 | 0.802978 | 0.026117 | 0.001352 | 0.001352 | 0.001351 |
12 | 0.224039 | 0.023161 | 0.000568 | 0.131756 | 0.000568 | 0.312246 | 0.000568 | 0.131744 | 0.174781 | 0.000568 |
13 | 0.231017 | 0.001191 | 0.001191 | 0.001191 | 0.023955 | 0.594456 | 0.001191 | 0.143427 | 0.001191 | 0.001191 |
14 | 0.002565 | 0.002564 | 0.002565 | 0.613030 | 0.002565 | 0.002565 | 0.040237 | 0.002565 | 0.328778 | 0.002564 |
15 | 0.298661 | 0.158551 | 0.001667 | 0.001667 | 0.001667 | 0.425807 | 0.001667 | 0.054123 | 0.001667 | 0.054522 |
16 | 0.366926 | 0.003572 | 0.003573 | 0.003572 | 0.214681 | 0.003572 | 0.305771 | 0.003572 | 0.003572 | 0.091189 |
17 | 0.102983 | 0.000807 | 0.361239 | 0.034604 | 0.117067 | 0.054262 | 0.000807 | 0.290779 | 0.036645 | 0.000807 |
18 | 0.131924 | 0.000820 | 0.601042 | 0.000820 | 0.000820 | 0.110248 | 0.011614 | 0.000820 | 0.000820 | 0.141073 |
19 | 0.260034 | 0.001961 | 0.001961 | 0.001961 | 0.332711 | 0.090885 | 0.171884 | 0.049259 | 0.087382 | 0.001961 |
20 | 0.031763 | 0.095154 | 0.317997 | 0.000769 | 0.000769 | 0.089795 | 0.363318 | 0.000769 | 0.098896 | 0.000769 |
21 | 0.006669 | 0.006669 | 0.006668 | 0.006667 | 0.006671 | 0.006669 | 0.808175 | 0.006670 | 0.138474 | 0.006668 |
22 | 0.192892 | 0.003572 | 0.003573 | 0.003573 | 0.333492 | 0.108656 | 0.003573 | 0.003572 | 0.343526 | 0.003572 |
23 | 0.005003 | 0.005002 | 0.093796 | 0.091570 | 0.241970 | 0.205297 | 0.299664 | 0.047696 | 0.005002 | 0.005002 |
24 | 0.002000 | 0.002001 | 0.127625 | 0.002000 | 0.072620 | 0.364125 | 0.002001 | 0.367634 | 0.057993 | 0.002001 |
25 | 0.004763 | 0.004762 | 0.004764 | 0.004762 | 0.710556 | 0.004763 | 0.111830 | 0.004763 | 0.004763 | 0.144274 |
26 | 0.110387 | 0.003572 | 0.286325 | 0.003572 | 0.003572 | 0.065960 | 0.003572 | 0.049891 | 0.469576 | 0.003572 |
27 | 0.427984 | 0.006668 | 0.006669 | 0.006667 | 0.151561 | 0.006669 | 0.373775 | 0.006668 | 0.006671 | 0.006667 |
28 | 0.277243 | 0.001755 | 0.104799 | 0.250019 | 0.001755 | 0.038028 | 0.321137 | 0.001755 | 0.001756 | 0.001755 |
29 | 0.003227 | 0.003226 | 0.003227 | 0.003226 | 0.088290 | 0.003226 | 0.885896 | 0.003228 | 0.003227 | 0.003227 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
13690 | 0.001389 | 0.147356 | 0.102580 | 0.001389 | 0.156851 | 0.036481 | 0.001389 | 0.185860 | 0.253403 | 0.113301 |
13691 | 0.333421 | 0.000827 | 0.072009 | 0.000827 | 0.000827 | 0.485963 | 0.000827 | 0.103647 | 0.000827 | 0.000826 |
13692 | 0.196621 | 0.002632 | 0.169927 | 0.050161 | 0.109173 | 0.002633 | 0.204476 | 0.002633 | 0.259113 | 0.002632 |
13693 | 0.407877 | 0.001087 | 0.212051 | 0.001087 | 0.001087 | 0.263763 | 0.001087 | 0.109786 | 0.001087 | 0.001087 |
13694 | 0.001613 | 0.300385 | 0.080069 | 0.179955 | 0.245212 | 0.001613 | 0.001613 | 0.001613 | 0.165463 | 0.022463 |
13695 | 0.002439 | 0.002439 | 0.182178 | 0.068554 | 0.092012 | 0.002440 | 0.301052 | 0.156707 | 0.189739 | 0.002439 |
13696 | 0.240402 | 0.003846 | 0.296007 | 0.003847 | 0.003847 | 0.301290 | 0.139221 | 0.003847 | 0.003847 | 0.003847 |
13697 | 0.114267 | 0.002128 | 0.055188 | 0.002128 | 0.028766 | 0.002128 | 0.002128 | 0.733954 | 0.002128 | 0.057186 |
13698 | 0.001695 | 0.001695 | 0.318692 | 0.001695 | 0.001695 | 0.667746 | 0.001695 | 0.001696 | 0.001695 | 0.001696 |
13699 | 0.002942 | 0.002942 | 0.220434 | 0.002942 | 0.273285 | 0.138406 | 0.350223 | 0.002942 | 0.002942 | 0.002942 |
13700 | 0.002273 | 0.002273 | 0.208484 | 0.028788 | 0.002273 | 0.002273 | 0.291414 | 0.211473 | 0.248476 | 0.002273 |
13701 | 0.002084 | 0.002084 | 0.309668 | 0.002083 | 0.002084 | 0.002084 | 0.321541 | 0.187172 | 0.169118 | 0.002084 |
13702 | 0.000794 | 0.000794 | 0.297327 | 0.104246 | 0.000794 | 0.154357 | 0.000794 | 0.418542 | 0.021558 | 0.000794 |
13703 | 0.002326 | 0.002326 | 0.128596 | 0.051862 | 0.465273 | 0.340313 | 0.002326 | 0.002326 | 0.002326 | 0.002326 |
13704 | 0.140096 | 0.004762 | 0.488923 | 0.004762 | 0.004763 | 0.126639 | 0.004763 | 0.215765 | 0.004763 | 0.004763 |
13705 | 0.003226 | 0.003227 | 0.409906 | 0.084880 | 0.003226 | 0.003227 | 0.162181 | 0.287394 | 0.003226 | 0.039507 |
13706 | 0.001282 | 0.398421 | 0.251342 | 0.154234 | 0.001282 | 0.001282 | 0.057943 | 0.001283 | 0.093845 | 0.039084 |
13707 | 0.131354 | 0.003846 | 0.003847 | 0.003846 | 0.003847 | 0.003848 | 0.003847 | 0.241739 | 0.599979 | 0.003847 |
13708 | 0.206511 | 0.071463 | 0.000550 | 0.000550 | 0.073339 | 0.131220 | 0.214766 | 0.300503 | 0.000550 | 0.000549 |
13709 | 0.607212 | 0.003449 | 0.238582 | 0.003449 | 0.003450 | 0.130061 | 0.003449 | 0.003449 | 0.003450 | 0.003449 |
13710 | 0.141811 | 0.003448 | 0.747494 | 0.086553 | 0.003449 | 0.003449 | 0.003449 | 0.003450 | 0.003449 | 0.003449 |
13711 | 0.005884 | 0.005883 | 0.251928 | 0.005883 | 0.005883 | 0.005884 | 0.005883 | 0.544371 | 0.162518 | 0.005883 |
13712 | 0.002273 | 0.002273 | 0.317546 | 0.002274 | 0.141929 | 0.286050 | 0.157851 | 0.050934 | 0.002273 | 0.036597 |
13713 | 0.005557 | 0.005556 | 0.590524 | 0.005557 | 0.005558 | 0.242133 | 0.005557 | 0.005558 | 0.005556 | 0.128443 |
13714 | 0.615493 | 0.088824 | 0.001111 | 0.001111 | 0.001111 | 0.001112 | 0.001111 | 0.287903 | 0.001112 | 0.001111 |
13715 | 0.921552 | 0.000807 | 0.000807 | 0.000807 | 0.000806 | 0.000807 | 0.000806 | 0.071996 | 0.000807 | 0.000806 |
13716 | 0.404989 | 0.003031 | 0.003031 | 0.003030 | 0.003030 | 0.003031 | 0.084891 | 0.488905 | 0.003031 | 0.003030 |
13717 | 0.004001 | 0.004000 | 0.130584 | 0.175442 | 0.004001 | 0.623367 | 0.004000 | 0.004001 | 0.046604 | 0.004000 |
13718 | 0.001087 | 0.055413 | 0.064813 | 0.001087 | 0.001087 | 0.198599 | 0.477068 | 0.001087 | 0.198671 | 0.001087 |
13719 | 0.204257 | 0.001755 | 0.381530 | 0.001755 | 0.001755 | 0.360242 | 0.001755 | 0.001755 | 0.001755 | 0.043443 |
13720 rows × 10 columns
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
---|---|---|---|---|---|---|---|---|---|---|
rank | ||||||||||
1 | player | rules | time | battle | war | player | characters | cards | games | french |
2 | dice | campaign | play | attack | units | points | character | card | rules | british |
3 | board | counters | make | victory | combat | city | heroes | player | new | north |
4 | tiles | german | win | army | player | building | treasure | play | edition | american |
5 | pieces | battle | ll | fight | map | end | monsters | hand | play | 000 |
6 | tile | scale | best | master | forces | turn | dragon | deck | set | town |
7 | die | scenarios | way | mission | world | build | adventure | points | version | strength |
8 | roll | map | just | enemy | unit | actions | dungeon | round | original | south |
9 | turn | army | race | power | movement | money | magic | turn | includes | save |
10 | points | scenario | need | battles | command | new | hero | played | box | river |
0 | |
---|---|
max_players | 0.241430 |
min_players | 0.108830 |
min_playtime | 0.203080 |
max_playtime | 0.237414 |
min_age | 0.209246 |
0 | |
---|---|
category_childrensgame | 0.003584 |
category_actiondexterity | 0.004004 |
category_partygame | 0.004981 |
category_worldwarii | 0.005507 |
category_moviestvradiotheme | 0.006864 |
category_abstractstrategy | 0.007163 |
category_humor | 0.007265 |
mechanic_cooperativeplay | 0.007869 |
mechanic_simulation | 0.007869 |
category_miniatures | 0.008260 |
category_dice | 0.008619 |
mechanic_memory | 0.008779 |
category_deduction | 0.008781 |
mechanic_rollspinandmove | 0.008850 |
category_adventure | 0.009006 |
category_animals | 0.009293 |
mechanic_areamovement | 0.009405 |
category_bluffing | 0.010340 |
category_sciencefiction | 0.010505 |
mechanic_actionpointallowancesystem | 0.011711 |
category_medieval | 0.011962 |
category_fighting | 0.012082 |
mechanic_auctionbidding | 0.012168 |
category_fantasy | 0.012281 |
mechanic_none | 0.012621 |
mechanic_pointtopointmovement | 0.013348 |
category_cardgame | 0.013492 |
mechanic_partnerships | 0.013829 |
mechanic_tileplacement | 0.013831 |
mechanic_simultaneousactionselection | 0.014041 |
mechanic_hexandcounter | 0.016242 |
mechanic_modularboard | 0.017134 |
category_wargame | 0.017296 |
mechanic_dicerolling | 0.017726 |
category_economic | 0.018247 |
mechanic_setcollection | 0.019286 |
mechanic_variableplayerpowers | 0.024662 |
mechanic_carddrafting | 0.026967 |
mechanic_areacontrolareainfluence | 0.028610 |
min_players | 0.029299 |
mechanic_handmanagement | 0.033756 |
max_players | 0.059831 |
min_playtime | 0.064849 |
min_age | 0.067872 |
max_playtime | 0.077706 |
complexity | 0.192207 |
mean_fit_time | mean_score_time | mean_test_score | mean_train_score | param_class_weight | param_max_features | param_min_samples_split | param_n_estimators | params | rank_test_score | ... | split2_test_score | split2_train_score | split3_test_score | split3_train_score | split4_test_score | split4_train_score | std_fit_time | std_score_time | std_test_score | std_train_score | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.035971 | 0.003265 | 0.692347 | 0.766254 | balanced | 4 | 10 | 3 | {'class_weight': 'balanced', 'max_features': 4, 'min_samples_split': 10, 'n_estimators': 3} | 12 | ... | 0.745262 | 0.749271 | 0.690849 | 0.765510 | 0.597886 | 0.769518 | 0.002877 | 0.000604 | 0.060190 | 0.009190 |
1 | 0.044662 | 0.003466 | 0.698324 | 0.783473 | balanced | 4 | 10 | 5 | {'class_weight': 'balanced', 'max_features': 4, 'min_samples_split': 10, 'n_estimators': 5} | 11 | ... | 0.763120 | 0.775966 | 0.705432 | 0.783912 | 0.612104 | 0.773982 | 0.004239 | 0.000408 | 0.051645 | 0.007626 |
2 | 0.083042 | 0.005658 | 0.712245 | 0.789450 | balanced | 4 | 10 | 10 | {'class_weight': 'balanced', 'max_features': 4, 'min_samples_split': 10, 'n_estimators': 10} | 9 | ... | 0.770408 | 0.791181 | 0.711994 | 0.788922 | 0.617572 | 0.785461 | 0.001559 | 0.000153 | 0.056820 | 0.003434 |
3 | 0.127855 | 0.008309 | 0.716399 | 0.800784 | balanced | 4 | 10 | 15 | {'class_weight': 'balanced', 'max_features': 4, 'min_samples_split': 10, 'n_estimators': 15} | 7 | ... | 0.765671 | 0.794916 | 0.723296 | 0.800401 | 0.622676 | 0.801221 | 0.006741 | 0.000515 | 0.054566 | 0.005317 |
4 | 0.202073 | 0.013132 | 0.711953 | 0.795536 | balanced | 4 | 10 | 25 | {'class_weight': 'balanced', 'max_features': 4, 'min_samples_split': 10, 'n_estimators': 25} | 10 | ... | 0.763848 | 0.788448 | 0.720744 | 0.795117 | 0.608458 | 0.791382 | 0.002880 | 0.000554 | 0.056656 | 0.007897 |
5 | 0.426163 | 0.029572 | 0.715889 | 0.800091 | balanced | 4 | 10 | 50 | {'class_weight': 'balanced', 'max_features': 4, 'min_samples_split': 10, 'n_estimators': 50} | 8 | ... | 0.777697 | 0.792912 | 0.726577 | 0.804409 | 0.609552 | 0.797030 | 0.012709 | 0.002753 | 0.059357 | 0.008176 |
6 | 0.032092 | 0.002788 | 0.827551 | 0.875638 | None | 4 | 10 | 3 | {'class_weight': None, 'max_features': 4, 'min_samples_split': 10, 'n_estimators': 3} | 6 | ... | 0.860423 | 0.872540 | 0.840685 | 0.873098 | 0.741524 | 0.878564 | 0.001444 | 0.000186 | 0.043707 | 0.002381 |
7 | 0.065479 | 0.005214 | 0.828717 | 0.878280 | None | 4 | 10 | 5 | {'class_weight': None, 'max_features': 4, 'min_samples_split': 10, 'n_estimators': 5} | 5 | ... | 0.861152 | 0.874453 | 0.841415 | 0.877471 | 0.748451 | 0.882664 | 0.001335 | 0.000455 | 0.040815 | 0.002629 |
8 | 0.176274 | 0.010163 | 0.838557 | 0.881013 | None | 4 | 10 | 10 | {'class_weight': None, 'max_features': 4, 'min_samples_split': 10, 'n_estimators': 10} | 1 | ... | 0.863338 | 0.878007 | 0.849435 | 0.879840 | 0.773970 | 0.885215 | 0.056332 | 0.001918 | 0.032840 | 0.002381 |
9 | 0.220291 | 0.013547 | 0.834913 | 0.881378 | None | 4 | 10 | 15 | {'class_weight': None, 'max_features': 4, 'min_samples_split': 10, 'n_estimators': 15} | 4 | ... | 0.864431 | 0.878371 | 0.840321 | 0.879111 | 0.764127 | 0.884941 | 0.026049 | 0.001387 | 0.036385 | 0.002402 |
10 | 0.290428 | 0.020428 | 0.836152 | 0.881651 | None | 4 | 10 | 25 | {'class_weight': None, 'max_features': 4, 'min_samples_split': 10, 'n_estimators': 25} | 3 | ... | 0.863703 | 0.878098 | 0.845425 | 0.879384 | 0.767043 | 0.885123 | 0.041821 | 0.001880 | 0.035300 | 0.002584 |
11 | 0.487544 | 0.026016 | 0.836662 | 0.882507 | None | 4 | 10 | 50 | {'class_weight': None, 'max_features': 4, 'min_samples_split': 10, 'n_estimators': 50} | 2 | ... | 0.865160 | 0.879464 | 0.846518 | 0.880113 | 0.765950 | 0.885579 | 0.074376 | 0.001378 | 0.035935 | 0.002363 |
12 rows × 24 columns
Your super big challenge
You want to make a quality game. Based on this dataset, what sort of game should you make? Use a random forest model to find the best set up parameters.
Bonus challenge: Use both features in the dataset and ones you construct from a topic model!
</div>

{:.input_area}
```python
rf_prediction = rf_best.predict_proba(bg_df[x_names])
```

{:.input_area}
```python
from sklearn.calibration import calibration_curve

def calplot(y_observed, y_predicted):
    rf_y, rf_x = calibration_curve(y_observed, y_predicted[:,1], n_bins=10)
    pd.DataFrame([rf_x , rf_y]).T.plot.scatter(x=0, y=1, figsize = (5,5))
```

{:.input_area}
```python
calplot(bg_df['quality_game'], rf_prediction)
```

{:.input_area}
```python
idf = pd.Series(rf_best.feature_importances_, index = x_names)
idf.sort_values()
```

{:.input_area}
```python
idf.sort_values().plot(kind='barh', )
```

{:.input_area}
```python
bg_df.keys()
```