# getBrowserSearches.py
# Author: Eni Mustafaraj
# Date: Nov 12, 2017
# Purpose: Extract a portion of the browser history belonging to a class
# activity in CS 234, on Friday, Nov 10, 2017.

import datetime
import sqlite3
from zoneinfo import ZoneInfo  # stdlib (Python 3.9+); replaces third-party dateutil

import pandas as pd

# HISTORY FILE
# IMPORTANT: put your USERNAME instead of 'emustafa'. In a Mac computer, the
# username is shown in the upper-right corner, next to the date, or in a terminal
# window is the name closest to the command prompt. Don't confuse it with your
# Machine Name. The user name shouldn't have spaces in it.
pathToHistory = "/Users/emustafa/Library/Application Support/Google/Chrome/Default/History"

# ALTERNATIVE
"""
Go to chrome://history/ in your browser, find the file History, download it
and then use the path to that file.
"""

############################## HELPER FUNCTIONS ################################

# Chrome stores visit timestamps as MICROSECONDS elapsed since Jan 1, 1601
# (the Windows FILETIME epoch) — NOT the Unix epoch (Jan 1, 1970).
CHROME_EPOCH = datetime.datetime(1601, 1, 1)


def convertChromeTime(ms):
    """Convert a count of microseconds in Chrome time into a (naive, UTC)
    datetime object.

    Google Chrome doesn't use the Unix epoch; `ms` counts microseconds
    from Jan 1, 1601.
    """
    return CHROME_EPOCH + datetime.timedelta(microseconds=ms)


def convertLocalTime(datetimeObj):
    """Convert a naive UTC datetime object into US Eastern local time.

    Chrome timestamps are UTC but decode to naive datetimes, so we first
    tag the object as UTC, then convert to the New York timezone.
    """
    dtUTC = datetimeObj.replace(tzinfo=datetime.timezone.utc)
    return dtUTC.astimezone(ZoneInfo('America/New_York'))


def convertToChromeTime(dateObj):
    """Convert a datetime object to microseconds in Google Chrome time.

    Exact inverse of convertChromeTime. Floor-dividing the timedelta by a
    one-microsecond timedelta yields an exact integer; the previous
    int(total_seconds()) * 1000000 truncated away all sub-second precision
    (and a float multiply would round above 2**53 microseconds anyway).
    """
    return (dateObj - CHROME_EPOCH) // datetime.timedelta(microseconds=1)

################################################################################


def main():
    """Extract the Nov 10, 2017 2:20-2:40pm (EST) class-activity visits from
    the Chrome History database and save them to a CSV file."""
    # Step 1: Connect to the History file; close it once the query is done
    # (the original leaked the connection and created an unused cursor).
    conn = sqlite3.connect(pathToHistory)
    try:
        # Step 2: Read data from urls and visits into a dataframe
        query = ("SELECT urls.url, urls.title, visit_time, visit_duration, "
                 "from_visit, transition, segment_id "
                 "FROM visits INNER JOIN urls ON urls.id = visits.url")
        visitsDF = pd.read_sql_query(query, conn)
    finally:
        conn.close()
    print(visitsDF.tail())  # shows the last 5 entries in the Chrome history

    # Step 3: Create two time moments for Nov 10, 2017 at 2:20 and 2:40, that
    # will represent the time interval we were doing Google searches in class.
    # Because the stored timestamps are in UTC, we add 5 hours (EST = UTC-5).
    startTimeMS = convertToChromeTime(datetime.datetime(2017, 11, 10, 19, 20))
    endTimeMS = convertToChromeTime(datetime.datetime(2017, 11, 10, 19, 40))

    # Step 4: Filter the dataframe to return only visits within the interval
    inWindow = visitsDF['visit_time'].apply(
        lambda val: startTimeMS < val < endTimeMS)
    ourDF = visitsDF[inWindow]
    print(ourDF.shape)
    print(ourDF.head())

    # Step 5: Store this data into a CSV file for inspection
    name = input("Enter your first name (e.g., eni): ")
    ourDF.to_csv("results-{}.csv".format(name))
    print("Task COMPLETED. Check CSV file to make sure that it has the results.")


if __name__ == "__main__":
    main()