# getBrowserSearches.py
# Author: Eni Mustafaraj
# Date: Nov 12, 2017
# Purpose: Extract a portion of the browser history belonging to a class
# activity in CS 234, on Friday, Nov 10, 2017.

import datetime
import sqlite3
from zoneinfo import ZoneInfo  # stdlib (Python 3.9+); replaces third-party dateutil

import pandas as pd

# HISTORY FILE
# IMPORTANT: put your USERNAME instead of 'emustafa'. In a Mac computer, the
# username is shown in the upper-right corner, next to the date, or in a terminal
# window is the name closest to the command prompt. Don't confuse it with your
# Machine Name. The user name shouldn't have spaces in it.
pathToHistory = "/Users/emustafa/Library/Application Support/Google/Chrome/Default/History"

# ALTERNATIVE
"""
Go to chrome://history/ in your browser, find the file History, download it
and then use the path to that file.
"""

############################## HELPER FUNCTIONS ################################

# Chrome stores visit timestamps as MICROSECONDS elapsed since Jan 1, 1601
# (the Windows FILETIME epoch) — NOT the Unix epoch (Jan 1, 1970).
CHROME_EPOCH = datetime.datetime(1601, 1, 1)


def convertChromeTime(ms):
    """Convert a count of microseconds in Chrome time into a (naive, UTC)
    datetime object.

    Google Chrome doesn't use the Unix epoch; `ms` counts microseconds
    from Jan 1, 1601.
    """
    return CHROME_EPOCH + datetime.timedelta(microseconds=ms)


def convertLocalTime(datetimeObj):
    """Convert a naive UTC datetime object into US Eastern local time.

    Chrome timestamps are UTC but decode to naive datetimes, so we first
    tag the object as UTC, then convert to the New York timezone.
    """
    dtUTC = datetimeObj.replace(tzinfo=datetime.timezone.utc)
    return dtUTC.astimezone(ZoneInfo('America/New_York'))


def convertToChromeTime(dateObj):
    """Convert a datetime object to microseconds in Google Chrome time.

    Exact inverse of convertChromeTime. Floor-dividing the timedelta by a
    one-microsecond timedelta yields an exact integer; the previous
    int(total_seconds()) * 1000000 truncated away all sub-second precision
    (and a float multiply would round above 2**53 microseconds anyway).
    """
    return (dateObj - CHROME_EPOCH) // datetime.timedelta(microseconds=1)

################################################################################


def main():
    """Extract the Nov 10, 2017 2:20-2:40pm (EST) class-activity visits from
    the Chrome History database and save them to a CSV file."""
    # Step 1: Connect to the History file; close it once the query is done
    # (the original leaked the connection and created an unused cursor).
    conn = sqlite3.connect(pathToHistory)
    try:
        # Step 2: Read data from urls and visits into a dataframe
        query = ("SELECT urls.url, urls.title, visit_time, visit_duration, "
                 "from_visit, transition, segment_id "
                 "FROM visits INNER JOIN urls ON urls.id = visits.url")
        visitsDF = pd.read_sql_query(query, conn)
    finally:
        conn.close()
    print(visitsDF.tail())  # shows the last 5 entries in the Chrome history

    # Step 3: Create two time moments for Nov 10, 2017 at 2:20 and 2:40, that
    # will represent the time interval we were doing Google searches in class.
    # Because the stored timestamps are in UTC, we add 5 hours (EST = UTC-5).
    startTimeMS = convertToChromeTime(datetime.datetime(2017, 11, 10, 19, 20))
    endTimeMS = convertToChromeTime(datetime.datetime(2017, 11, 10, 19, 40))

    # Step 4: Filter the dataframe to return only visits within the interval
    inWindow = visitsDF['visit_time'].apply(
        lambda val: startTimeMS < val < endTimeMS)
    ourDF = visitsDF[inWindow]
    print(ourDF.shape)
    print(ourDF.head())

    # Step 5: Store this data into a CSV file for inspection
    name = input("Enter your first name (e.g., eni): ")
    ourDF.to_csv("results-{}.csv".format(name))
    print("Task COMPLETED. Check CSV file to make sure that it has the results.")


if __name__ == "__main__":
    main()