"""
A script for scraping the lyrics to an artist's songs on Genius.
Adapted from https://chrishyland.github.io/scraping-from-genius

Step 1: Insert your Genius API token after the import statements.
Step 2: Find the artist's id. Use this URL, changing "mitski" to the target artist
        and YOUR_API_TOKEN to your own token:
        https://api.genius.com/search?q=mitski&access_token=YOUR_API_TOKEN
        If the search succeeds, you should be able to find the artist's id in the response.

Step 3: Create a text file containing the names of albums by the artist, one album per line.
        I used only studio albums as listed on Wikipedia.
Step 4: Run this program from the command line, passing in the artist's name, id, and the
        location of the album list as arguments:
        python scrape_lyrics.py artistName artistID albumFile
"""

import requests
from bs4 import BeautifulSoup
import os
import re
import csv
import sys

# Root URL of the Genius REST API; request paths are appended to it.
base = "https://api.genius.com"

# API token used to authenticate against the Genius API. A non-empty value in
# the GENIUS_API_TOKEN environment variable takes precedence, so the secret
# does not have to live in the source file.
# NOTE(review): the fallback below is a credential committed to the source;
# it should be rotated and removed once the environment variable is in use.
GENIUS_API_TOKEN = (
    os.environ.get("GENIUS_API_TOKEN")
    or 'g6ldNsYBVUxz3T7xp_eiFA9_wW6SWfxjoZxjM5u5K1DjKn5fg0G0ojj84fE1doTV'
)

def connect_lyrics(song_id):
    '''Return the site path of a song's lyrics page.

    Fetches the record for ``song_id`` through ``get_json`` and returns the
    ``path`` field of the song object found in the response.
    '''
    song_record = get_json(f"songs/{song_id}")['response']['song']
    return song_record['path']

def get_json(path, params=None, headers=None, timeout=30):
    '''Send an authenticated GET request to the Genius API and decode the reply.

    Args:
        path: API path relative to ``base`` (for example ``"songs/123"``).
        params: Optional query-string parameters passed to ``requests.get``.
        headers: Optional extra request headers. The mapping is copied, so the
            caller's object is never modified.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the script indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    '''
    # Generate request URL
    requrl = '/'.join([base, path])

    # Build the headers on a copy so the Authorization token is not written
    # into a dict owned by the caller.
    request_headers = dict(headers) if headers else {}
    request_headers['Authorization'] = "Bearer {}".format(GENIUS_API_TOKEN)

    # Get response object from querying genius api
    response = requests.get(
        url=requrl,
        params=params,
        headers=request_headers,
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()

def get_song_id(artist_id):
    '''Collect the ids of all songs on which the artist is the primary artist.

    Requests the artist's song listing page by page, starting at page 1,
    until a page comes back with no songs.

    Args:
        artist_id: Genius id of the artist. It is compared with ``==`` against
            the ``primary_artist`` id of each song, so it must have the same
            type as the ids in the API response (the script passes an int).

    Returns:
        A list of song ids, excluding songs whose primary artist is someone
        else.
    '''
    # The listing path does not depend on the page, so build it once.
    path = "artists/{}/songs/".format(artist_id)
    songs = []  # song objects gathered from every page
    current_page = 1

    while True:
        data = get_json(path=path, params={'page': current_page})
        page_songs = data['response']['songs']
        if not page_songs:
            # An empty page marks the end of the listing.
            break

        songs += page_songs
        # Report the page that was just processed *before* advancing, so the
        # number shown matches the page actually scraped.
        print("Page {} finished scraping".format(current_page))
        current_page += 1
        # If you don't want to scrape all pages, un-comment this
        #if current_page == 3:
        #    break

    # current_page now points at the page after the last non-empty one.
    print("Song id were scraped from {} pages".format(current_page - 1))

    # Get all the song ids, excluding not-primary-artist songs.
    return [song["id"] for song in songs
            if song["primary_artist"]["id"] == artist_id]

def retrieve_lyrics(song_id):
    '''Download a song's page and return its lyric lines.

    The page path is looked up with ``connect_lyrics``, the page is fetched
    and parsed, and the text of the first matching lyrics container is split
    into lines. Lines that start with ``[`` (after leading whitespace) are
    dropped.

    Args:
        song_id: Genius id of the song.

    Returns:
        A list of lyric lines, or an empty list when no known lyrics
        container is present in the page.
    '''
    path = connect_lyrics(song_id)

    lyrics_url = "http://genius.com" + path
    print(lyrics_url)
    # A timeout keeps a stalled connection from hanging the whole run.
    page = requests.get(lyrics_url, timeout=30)

    # Extract the page's HTML as a string
    html = BeautifulSoup(page.text, "html.parser")

    # CSS class strings of the lyrics container, tried in this order.
    container_classes = (
        "Lyrics__Container-sc-1ynbvzw-6 YYrds",
        "Lyrics__Container-sc-1ynbvzw-1 kUgSbL",
    )
    for css_class in container_classes:
        container = html.find("div", class_=css_class)
        if container is None:
            continue
        # Join the text nodes with "|" and split again to get one entry per
        # text node, discarding empty ones.
        lines = [line for line in container.get_text("|").split('|') if line != '']
        # Strip leading whitespace *before* looking at the first character so
        # that indented bracketed lines are filtered out as well.
        return [line for line in lines if not line.lstrip().startswith('[')]

    print("Failed to find", song_id)
    return []

def get_song_information(song_ids, album_list):
    '''Fetch metadata for each song and keep the songs on the requested albums.

    Args:
        song_ids: Iterable of Genius song ids to look up.
        album_list: Iterable of album names. Matching ignores case and
            surrounding whitespace.

    Returns:
        A dict mapping song id to a dict with the keys ``title``, ``album``
        (``"<single>"`` when the song has no album), ``release_date`` (the
        part of the date before the first ``-``, or ``"unidentified"``),
        ``genius_track_id`` and ``genius_album_id`` (``"none"`` when the song
        has no album). Only songs whose album is in ``album_list`` are
        included.
    '''
    # Normalise once into a set: the song's album name is stripped and
    # lower-cased before the lookup, so the wanted names must be too.
    wanted_albums = {name.strip().lower() for name in album_list}

    song_list = {}
    for song_id in song_ids:
        data = get_json(path="songs/{}".format(song_id))["response"]["song"]
        album = data["album"]
        release_date = data["release_date"]

        stats = {
            "title": data["title"],
            "album": album["name"] if album else "<single>",
            "release_date": release_date.split('-')[0] if release_date else "unidentified",
            "genius_track_id": song_id,
            "genius_album_id": album["id"] if album else "none",
        }
        if stats["album"].strip().lower() in wanted_albums:
            song_list[song_id] = stats
        else:
            # Song is not on a requested album: print its album name and skip it.
            print(stats["album"])

    return song_list

def write_lyrics_to_file(lines, artist_name):
    '''Write the collected rows to ``lyrics/<artist_name lower-cased>.csv``.

    Args:
        lines: Iterable of rows; each row is a sequence of field values.
        artist_name: Artist name used (lower-cased) as the file name.

    The ``lyrics`` directory is created when it does not exist yet. The file
    is written as UTF-8 so lyrics in any script can be stored, and opened
    with ``newline=''`` as the csv module requires, which avoids blank rows
    on platforms that translate line endings.
    '''
    os.makedirs("lyrics", exist_ok=True)
    out_path = "lyrics/" + artist_name.lower() + '.csv'
    with open(out_path, 'w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerows(lines)

def main():
    '''Scrape an artist's lyrics and write them to a CSV file.

    Reads three command-line arguments: the artist's name, the artist's
    Genius id (an integer) and the path of a text file listing album names,
    one per line. When arguments are missing or the id is not an integer, a
    usage message is printed and the function returns without doing any work.

    Each output row holds: artist name, album, song title, lyric line, the
    line's index within the song, and the release year.
    '''
    usage = "Usage: python scrape_lyrics.py artistName artistID albumFile"
    if len(sys.argv) < 4:
        print(usage)
        return

    term = sys.argv[1]
    try:
        artist_id = int(sys.argv[2])
    except ValueError:
        # A non-numeric id is a usage error, not a reason for a traceback.
        print("artistID must be an integer, got: {}".format(sys.argv[2]))
        print(usage)
        return

    with open(sys.argv[3], 'r') as albumF:
        # Skip blank lines so they do not turn into empty album names.
        album_list = [name for name in (a.strip() for a in albumF) if name]

    # Example arguments (artistName artistID albumFile):
    #   Beyoncé 498    album_lists/bey_albums.txt
    #   Mitski  265258 album_lists/mitski_albums.txt
    #   Swift   1177   album_lists/swift_albums.txt

    # Grabs all song id's from artist
    songs_ids = get_song_id(artist_id)
    print(f"Number of songs found: {len(songs_ids)}")

    # Get meta information about songs
    song_dict = get_song_information(songs_ids, album_list)
    print(f"Number of songs matched: {len(song_dict)}")

    # Scrape lyrics from the songs, one output row per lyric line
    lines = []
    for song, info in song_dict.items():
        for i, lyric in enumerate(retrieve_lyrics(song)):
            lines.append([term, info["album"], info["title"], lyric, str(i), info["release_date"]])
    write_lyrics_to_file(lines, term)


# Run the scraper only when this file is executed as a script, so that
# importing the module does not start a scrape as a side effect.
if __name__ == "__main__":
    main()
