Neal Caren - University of North Carolina, Chapel Hill web twitter scholar
After reading "11 Amazing Outfits Ronald Reagan Rocked" today, I started to wonder about the distribution of numbers in Buzzfeed headlines. I wrote a quick Python script to analyze which numbers were used most frequently, based on the last 1,861 article headlines tweeted. The full code and results are below.
from __future__ import division
from collections import Counter
import twython
# Placeholder API credentials -- replace each value with your own, which you
# can obtain from http://dev.twitter.com
consumer_key = 'your_consumer_key'
consumer_secret = 'your_consumer_secret'
access_token = 'your_access_token'
access_token_secret = 'your_access_token_secret'

# Build the authenticated client used for every request below.
twitter = twython.Twython(
    app_key=consumer_key,
    app_secret=consumer_secret,
    oauth_token=access_token,
    oauth_token_secret=access_token_secret,
)
# Collect the text of recent tweets from the 'buzzfeed' timeline.
headlines = []

# Arbitrarily large tweet id used as the max_id bound for the first request.
last_id = 907718359241744384

# Ask for up to 16 pages of up to 200 tweets each, leaving out retweets.
for i in range(1, 17):
    tweets = twitter.get_user_timeline(
        screen_name='buzzfeed',
        # A real boolean, not the string 'False' (a non-empty string is truthy
        # and is not a boolean value as far as the client library is concerned).
        include_rts=False,
        count=200,
        max_id=last_id,
    )
    # An empty page means there is nothing older to fetch; without this guard
    # tweets[-1] below would raise IndexError.
    if not tweets:
        break
    # Extend in place rather than re-copying the whole list on every page.
    headlines.extend(tweet['text'] for tweet in tweets)
    # Continue the next page strictly below the last tweet seen on this one.
    last_id = tweets[-1]['id'] - 1

# Show a sample of what was collected.
for headline in headlines[:15]:
    print(headline)
# Remove duplicate headlines. Going through set() does not keep the original
# order, which the counts below do not depend on.
headlines = list(set(headlines))
# Single-argument print(...) produces the same text on Python 2 and Python 3.
print('%d headlines' % len(headlines))

# Flatten all headlines into one list of whitespace-separated words.
headline_words = [word for headline in headlines for word in headline.split()]
print('%d headline words' % len(headline_words))

# Keep only the words that consist entirely of digits (still as strings).
numbers = [word for word in headline_words if word.isdigit()]
print('%d numbers used' % len(numbers))
def _percent(part, whole):
    """Return part/whole as a whole-number percentage, rounded down.

    Returns 0 when ``whole`` is 0 instead of raising ZeroDivisionError.
    """
    # Integer arithmetic avoids float artefacts: 29/100*100 evaluates to
    # 28.999999999999996, which '%d' would print as 28.
    return part * 100 // whole if whole else 0


def _has_number(headline):
    """Return True if any whitespace-separated word of headline is all digits."""
    return any(word.isdigit() for word in headline.split())


def _starts_with_number(headline):
    """Return True if the first word of headline is all digits.

    An empty or whitespace-only headline has no first word and yields False.
    """
    words = headline.split()
    return bool(words) and words[0].isdigit()


# Share of all headlines that contain a number anywhere.
any_number = [headline for headline in headlines if _has_number(headline)]
print('%02d%% any number' % _percent(len(any_number), len(headlines)))

# Of the headlines that contain a number, the share that start with one.
numbers_first = [headline for headline in headlines
                 if _starts_with_number(headline)]
print('%02d%% number first' % _percent(len(numbers_first), len(any_number)))

# Share of all numbers found that are odd.
# NOTE(review): the label says "odd first", but the figure is computed over
# every number in `numbers`, not only leading ones -- confirm the wording.
odd_numbers = [number for number in numbers if int(number) % 2 == 1]
print('%02d%% odd first' % _percent(len(odd_numbers), len(numbers)))
# Count how many times each number (as a string) appears.
number_counter = Counter(numbers)

# Table of the 23 most frequent numbers and their counts.
print(' No. Freq')
for number, count in number_counter.most_common(23):
    print('%4d %4d' % (int(number), count))

# Print every headline that contains the number 23 as a separate word.
# NOTE(review): '23' is hard-coded; it is not derived from number_counter, so
# it only matches "the most common number" if the table above says so.
for headline in headlines:
    if '23' in headline.split():
        print(headline)
#import tools for graphing
%matplotlib inline
import pandas as pd
# Histogram of the headline numbers that are 40 or less.
small_numbers = [value for value in map(int, numbers) if value <= 40]
df = pd.DataFrame(small_numbers, columns=['Buzzfeed Headline Numbers'])
df.hist(bins=40)
#ignore code below. Imports style sheet for this page.
from IPython.core.display import HTML
def css_styling():
    """Return the contents of ``custom.css`` wrapped in an ``HTML`` object.

    The file is read from the current working directory; ``open`` raises
    if it does not exist.
    """
    # The with-block closes the file handle even if read() fails.
    with open("custom.css", "r") as stylesheet:
        styles = stylesheet.read()
    return HTML(styles)


css_styling()