"""
Author: 
Date: 
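Description: Helpers for loading Goodreads book records stored one JSON object
per line, grouping them by genre, tokenizing text with spaCy, and totalling
per-genre token counts.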

"""

import sys
import json
import math
import random
from numpy import argmax
from collections import Counter
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

# Instantiate spaCy's English language class once at import time and keep a
# module-level handle to its tokenizer; tokenize() calls this object directly.
nlp = English()
tokenizer = nlp.tokenizer

def load_data(fn):
	"""Loads book records from the JSON-lines file fn.

	Each non-blank line of the file is parsed as one JSON document.

	Args:
		fn: path of the file to read.

	Returns:
		A list with one parsed object per non-blank line, in file order.

	Raises:
		OSError: if the file cannot be opened.
		json.JSONDecodeError: if a non-blank line is not valid JSON.
	"""
	# Read as UTF-8 explicitly rather than relying on the platform's default
	# encoding, which differs between systems.
	with open(fn,'r',encoding='utf-8') as of:
		# Blank lines (e.g. a trailing empty line at end of file) carry no
		# record and would otherwise make json.loads raise, so skip them.
		return [json.loads(line) for line in of if line.strip()]

def sort_and_filter_books(books,genres):
	"""Groups books by their "genre" value, dropping any book whose genre is not in genres.

	Returns a dictionary mapping each kept genre to the list of its books, in input order.
	"""
	grouped = {}
	for record in books:
		label = record["genre"]
		# Guard clause: ignore genres the caller did not ask for.
		if label not in genres:
			continue
		grouped.setdefault(label, []).append(record)
	return grouped

def tokenize(s):
	"""Splits the string s with the module-level tokenizer and returns the token texts as a list of strings."""
	return [token.text for token in tokenizer(s)]

def make_genre_sums(genre_counts,vocab):
	"""Totals, for each genre, the counts of the tokens that appear in vocab.

	Args:
		genre_counts: mapping of genre -> mapping of token -> count
			(anything exposing .items(), e.g. a Counter per genre).
		vocab: container of tokens to include; only membership (`in`) is used.

	Returns:
		A dict mapping each genre to the sum of its in-vocabulary counts
		(0 when none of its tokens are in vocab).
	"""
	# A list/tuple vocab would turn every `in` test into a linear scan, making
	# the whole pass O(tokens * vocab). Hash it once up front instead.
	if isinstance(vocab, (list, tuple)):
		try:
			vocab = frozenset(vocab)
		except TypeError:
			pass  # unhashable entries: keep the sequence's own membership test
	return {
		genre: sum(count for token, count in counter.items() if token in vocab)
		for genre, counter in genre_counts.items()
	}


def main():
	"""Loads the training sample, groups its books by the genres of interest and prints the grouping."""
	train_fn = "goodreads_US_17sample.json"
	test_fn = "goodreads_US_19sample.json"  # defined here but not read in this function
	genres = [
		"young_adult",
		"mystery_thriller_crime",
		"romance",
		"fantasy_paranormal",
	]
	by_genre = sort_and_filter_books(load_data(train_fn),genres)
	print(by_genre)

# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
	main()