Word Frequency
This notebook finds the word frequencies for a dataset.
Last updated
Was this helpful?
This notebook finds the word frequencies for a dataset.
Last updated
Was this helpful?
import matplotlib.pyplot as plt
# Plot the 20 most frequent words as a horizontal bar chart.
top_words = transformed_word_frequency.most_common(20)

# Split the (word, count) pairs into parallel lists. Comprehensions also cope
# with an empty counter, where indexing into zip(*pairs) would raise IndexError.
x_val = [word for word, _ in top_words]
y_val = [count for _, count in top_words]

plt.figure(figsize=(12, 8))  # Customize plot size
plt.barh(x_val, y_val, color='blue', height=0.3)
plt.xlabel("Word Counts")
plt.gca().invert_yaxis()  # Flip the y-axis so the first (most common) word is on top
plt.show()
#4 Find Word Frequencies
from collections import Counter

# Hold our word counts in a Counter object
transformed_word_frequency = Counter()

# Added: every lowercased unigram, collected so the word cloud can be built
# from a single string afterwards (joined once instead of repeated +=).
cloud_words = []

for document in tdm_client.dataset_reader(dataset_file):
    # Apply filter list
    if use_filtered_list is True:
        document_id = document['id']
        # Skip documents not in our filtered_id_list
        if document_id not in filtered_id_list:
            continue
    # Default to an empty dict so .items() still works when the key is missing
    unigrams = document.get("unigramCount", {})
    for gram, count in unigrams.items():
        clean_gram = gram.lower()  # Lowercase the unigram
        # Added: recorded before filtering, so stop words and non-alphabetic
        # tokens are still part of the word-cloud text
        cloud_words.append(clean_gram)
        if clean_gram in stop_words:  # Remove unigrams from stop words
            continue
        if not clean_gram.isalpha():  # Remove unigrams that are not purely alphabetic
            continue
        transformed_word_frequency[clean_gram] += count

# Added: string of all words, separated by single spaces
word_str = " ".join(cloud_words)
#Install wordcloud
pip install wordcloud
#Import WordCloud and matplotlib to plot the word cloud
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
# Added: build the word cloud from the collected word string.
# Configure the renderer first, then generate the cloud from word_str.
cloud_builder = WordCloud(
    width=800,
    height=800,
    background_color='white',
    stopwords=stop_words,
    min_font_size=10,
)
wordcloud = cloud_builder.generate(word_str)

# Show the WordCloud image on a square figure with the axes hidden
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()