import pandas as pd
import nltk
'punkt')
nltk.download(
from nltk.tokenize import sent_tokenize
[nltk_data] Downloading package punkt to /Users/nealcaren/nltk_data...
[nltk_data] Package punkt is already up-to-date!
February 21, 2024
[nltk_data] Downloading package punkt to /Users/nealcaren/nltk_data...
[nltk_data] Package punkt is already up-to-date!
Load a CSV file I created from a Scopus search for specific sociology journals over the last decade.
df = pd.read_json('https://raw.githubusercontent.com/nealcaren/notes/main/posts/abstracts/sociology-abstracts.json')
df['Year'].value_counts()
Year
2018 1076
2017 1068
2019 1033
2015 1031
2016 989
2021 983
2022 972
2020 932
2014 870
2023 843
Name: count, dtype: int64
Source title
Demography 890
Journal of Marriage and Family 757
Social Science Research 672
Social Forces 594
Sociological Forum 503
Sociological Perspectives 455
American Sociological Review 417
Social Networks 407
Sociological Methods and Research 381
Social Problems 366
Gender and Society 326
American Journal of Sociology 324
Journal of Health and Social Behavior 314
Sociological Inquiry 310
Sociology of Race and Ethnicity 289
Theory and Society 287
Sociological Science 271
City and Community 260
Social Currents 255
Symbolic Interaction 244
Qualitative Sociology 229
Mobilization 207
Social Psychology Quarterly 201
Poetics 183
Du Bois Review 182
Sociology of Education 171
Sociological Theory 160
Work and Occupations 142
Name: count, dtype: int64
df['abstract_first_sentence_punctuation'] = df['abstract_first_sentence'].str[-1]
df['abstract_first_sentence_punctuation'].value_counts()
abstract_first_sentence_punctuation
. 9290
? 480
" 24
) 2
! 1
Name: count, dtype: int64
# Okay, who used the exclamation mark?!!!
screen = df['abstract_first_sentence_punctuation']=="!"
df[screen]
DOI | Source title | Year | Authors | Title | Abstract | abstract_first_sentence | abstract_first_sentence_punctuation | |
---|---|---|---|---|---|---|---|---|
4178 | 10.1086/700831 | American Journal of Sociology | 2018 | Padgett J.F. | Faulkner’s assembly of memories into history: ... | In Absalom, Absalom! William Faulkner develops... | In Absalom, Absalom! | ! |
df['abstract_starts_with_a_question'] = df['abstract_first_sentence_punctuation'] == '?'
df['abstract_starts_with_a_question'].mean()
0.04899459018066755
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
# Set the Seaborn theme
sns.set_theme(style="white")
# Create the plot with pandas, now styled by Seaborn
ax = pd.crosstab(df['Source title'],
df['abstract_starts_with_a_question'], normalize='index').sort_values(by=True)[True].plot(kind='barh', figsize=(6, 6))
# Add a title and possibly adjust other aspects like labels
plt.title('What percent of journal abstracts start with a question?')
plt.xlabel('')
plt.ylabel('')
# Format the x-axis as percentages
ax.xaxis.set_major_formatter(mtick.PercentFormatter(1.0, decimals=0))
# Remove the top and right borders
sns.despine()
# Show the plot
plt.show()
df['abstract_first_word'] = df['abstract_first_sentence'].str.split().str[0]
df[df['abstract_starts_with_a_question']]['abstract_first_word'].value_counts()[:10]
abstract_first_word
How 163
What 65
Why 59
Does 31
Do 28
When 16
Is 15
Are 12
Can 10
This 7
Name: count, dtype: int64
# Step 1: Prepare the Data
# Categorize 'abstract_first_word' into "How", "What", "Why", and "Other"
dfa['category'] = dfa['abstract_first_word'].apply(lambda x: x if x in ['How', 'What', 'Why'] else 'Other')
# Create a crosstab/pivot table for the count or proportion of each category by 'Source title'
data_for_plot = pd.crosstab(dfa['Source title'], dfa['category'], normalize='index').sort_values(by='Why')
# Step 2: Plot the Stacked Bar Plot
# Set the Seaborn theme
sns.set_theme(style="white")
# Plot
ax = data_for_plot.plot(kind='barh', stacked=True, figsize=(10, 6))
# Add a title and adjust labels
plt.xlabel('Percent')
plt.ylabel('Source title')
# Format the x-axis as percentages
ax.xaxis.set_major_formatter(mtick.PercentFormatter(xmax=1.0, decimals=0))
# Remove the top and right borders
sns.despine()
ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.1), ncol=4)
# Show the plot
plt.show()