"""
Author: 
Date: 
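Description: Count word frequencies in the .txt files under gutenberg_data/
and plot the most frequent words with matplotlib.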

"""

from collections import Counter
import os
import matplotlib.pyplot as plt

def get_words(s,do_lower=True):
	"""Split a string into whitespace-separated words.

	Args:
		s: The text to tokenize.
		do_lower: When truthy (the default), lowercase the text before
			splitting; otherwise keep the original casing.

	Returns:
		A list of the whitespace-delimited tokens of ``s``.
	"""
	text = s.lower() if do_lower else s
	return text.split()

def count_words(words):
	"""Tally how many times each item occurs in ``words``.

	Args:
		words: An iterable of hashable items (typically word strings).

	Returns:
		A ``collections.Counter`` mapping each distinct item to its count.
	"""
	tally = Counter()
	tally.update(words)
	return tally

def words_by_frequency(words,n=0):
	"""Rank the distinct words by how often they occur, most frequent first.

	Args:
		words: An iterable of words to count (passed to ``count_words``).
		n: How many leading entries to keep. A falsy value (the default 0)
			returns the complete ranking.

	Returns:
		A list of ``(word, count)`` tuples in descending count order. The sort
		is stable, so words with equal counts keep their first-seen order.
	"""
	ranked = list(count_words(words).items())
	ranked.sort(key=lambda pair: pair[1], reverse=True)
	if not n:
		return ranked
	return ranked[:n]

def plot_frequency(tokens):
	"""Show a bar chart of word frequencies.

	Args:
		tokens: A sequence of ``(word, count)`` pairs. Bars are drawn in the
			order given, one per pair.

	Returns:
		None. The figure is displayed via ``plt.show()``.
	"""
	keys = [t[0] for t in tokens]
	values = [t[1] for t in tokens]
	plt.bar(keys,values)
	plt.xlabel("Words")
	plt.ylabel("Frequency counts")
	plt.title("Words by frequency")
	# Style the tick labels in a single call: rotated so the words do not
	# overlap, and small enough to fit many of them on one axis.
	plt.xticks(rotation=90,fontsize=6)
	plt.show()

def plot_frequency_by_length(tokens):
	"""Show a scatter plot of word frequencies, ordered by word length.

	Args:
		tokens: An iterable of ``(word, count)`` pairs. They are re-ordered
			from shortest word to longest before plotting; the sort is
			stable, so equal-length words keep their incoming order.

	Returns:
		None. The figure is displayed via ``plt.show()``.
	"""
	def word_length(pair):
		return len(pair[0])

	by_length = list(tokens)
	by_length.sort(key=word_length)
	words = [pair[0] for pair in by_length]
	counts = [pair[1] for pair in by_length]
	plt.scatter(words,counts)
	plt.title('Words by frequency in Project Gutenberg')
	plt.xlabel("Words")
	plt.ylabel("Frequency counts")
	# Rotate and shrink the tick labels so the words stay legible.
	plt.xticks(rotation = 90)
	plt.xticks(fontsize=6)
	plt.show()

def main():
	"""Plot the 100 most frequent words of each text file in ``gutenberg_data``.

	Every file in the ``gutenberg_data`` directory (relative to the current
	working directory) whose name ends in ``.txt`` is read, split into
	lowercase words, and shown as a bar chart of its 100 most common words.
	The file name is printed before each file is processed.

	Raises:
		OSError: If the directory or one of its files cannot be read.
	"""
	fpath = 'gutenberg_data'
	# os.listdir returns names in arbitrary order; sort so runs are repeatable.
	for fname in sorted(os.listdir(fpath)):
		if not fname.endswith('.txt'):
			continue
		# Decode explicitly instead of relying on the platform's default
		# encoding. Assumes the corpus is UTF-8 (TODO confirm); undecodable
		# bytes are replaced rather than aborting the whole run.
		with open(os.path.join(fpath,fname),encoding='utf-8',errors='replace') as fp:
			print(fname)
			text = fp.read()
		# The file is closed here, so it is not held open while the plot is shown.
		words = get_words(text)
		counts = words_by_frequency(words,100)
		# plot_frequency_by_length(counts) is the alternative view, ordered
		# by word length instead of by frequency.
		plot_frequency(counts)

# Run the analysis only when this file is executed as a script, not on import.
if __name__ == '__main__':
	main()