Word Frequency

This notebook finds the word frequencies for a dataset.

Research Notebook: Exploring Word Frequencies for Research

Explore word frequency of your own extracted data

Create a bar chart for the 20 most frequently used words

import matplotlib.pyplot as plt 

# Take the 20 most frequent (word, count) pairs, most frequent first.
top_words = transformed_word_frequency.most_common(20)

# Split the pairs into parallel word/count sequences for plotting.
# An empty counter yields no pairs, so fall back to empty sequences
# rather than failing on an out-of-range index.
if top_words:
    words, counts = zip(*top_words)
else:
    words, counts = (), ()

x_val = list(words)
y_val = list(counts)

plt.figure(figsize=(12, 8))  # Customize plot size
plt.barh(x_val, y_val, color='blue', height=0.3)
plt.xlabel("Word Counts")
# barh draws the first entry at the bottom; flip the axis so the most
# frequent word appears at the top of the chart.
plt.gca().invert_yaxis()

Create a wordcloud chart for the extracted text data

Modify step 4, "Find Word Frequencies", as follows:

#4 Find Word Frequencies
# Complete version of the step 4 cell with the word-cloud additions merged
# in. Lines marked "Added" are new; everything else is the existing cell.
from collections import Counter

# Hold our word counts in a Counter Object
transformed_word_frequency = Counter()

# Added: pieces of the word-cloud text. They are joined once after the loop
# so the string is not rebuilt for every unigram.
word_parts = [" "]

# Apply filter list
for document in tdm_client.dataset_reader(dataset_file):
    if use_filtered_list is True:
        document_id = document['id']
        # Skip documents not in our filtered_id_list
        if document_id not in filtered_id_list:
            continue
    # Default to an empty dict so a document without "unigramCount" is
    # skipped cleanly (a list default has no .items()).
    unigrams = document.get("unigramCount", {})
    for gram, count in unigrams.items():
        clean_gram = gram.lower()  # Lowercase the unigram
        # Added: collect every lowercased unigram, before any filtering
        word_parts.append(" " + clean_gram)
        if clean_gram in stop_words:  # Remove unigrams from stop words
            continue
        if not clean_gram.isalpha():  # Remove unigrams that are not alphabetic
            continue
        transformed_word_frequency[clean_gram] += count

# Added: string of all words, used as the input text for the word cloud
word_str = "".join(word_parts)

#Install wordcloud
# Run pip through the interpreter that is executing this notebook so the
# package is installed into the same environment the kernel imports from.
import subprocess
import sys

subprocess.check_call([sys.executable, "-m", "pip", "install", "wordcloud"])

#Import WordCloud and matplotlib for plotting the word cloud
from wordcloud import WordCloud, STOPWORDS 
import matplotlib.pyplot as plt

#Added: plot word cloud
# Settings for the cloud: an 800x800 white canvas, the notebook's stop word
# list, and a minimum font size of 10.
cloud_options = {
    "width": 800,
    "height": 800,
    "background_color": 'white',
    "stopwords": stop_words,
    "min_font_size": 10,
}
wordcloud = WordCloud(**cloud_options).generate(word_str)

# Show the cloud on a square figure with the axes hidden and no padding
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

Previous: Create A Stopwords List | Next: Digital Scholarship Incubator

Last updated 4 years ago

Was this helpful?