Import all packages and authenticate Spotipy credentials

import pandas as pd
import numpy as np

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import spotipy.util as util

# Spotify credential / client setup.
# NOTE(review): my_client_id / my_client_secret are not defined anywhere in
# this file -- presumably set in an earlier (hidden) notebook cell; confirm.
username = 'drogers771'
scope=''

# Client-credentials flow: app-level auth, sufficient for the public
# search/album/track endpoints used below.
ccm = SpotifyClientCredentials(client_id=my_client_id,
                               client_secret=my_client_secret)

# User-authorization token; my_token is never used afterwards -- the client
# below authenticates with ccm instead.
my_token = util.prompt_for_user_token(username,
                           scope,
                           client_id=my_client_id,
                           client_secret=my_client_secret,
                           redirect_uri='http://localhost:8889')
# NOTE(review): duplicate imports -- spotipy and SpotifyClientCredentials are
# already imported at the top of the file, and sys is unused here.
from spotipy.oauth2 import SpotifyClientCredentials
import sys
import spotipy

sp = spotipy.Spotify(client_credentials_manager=ccm)

def get_artist(name):
    """Return the first Spotify artist object matching *name*.

    Raises:
        ValueError: if the search returns no artists (the original code
            raised a bare IndexError from ``items[0]`` in that case).
    """
    # Restrict to the US market so region-locked duplicates don't surface.
    results = sp.search(q='artist:' + name, type='artist', market='US')
    items = results['artists']['items']
    if not items:
        raise ValueError(f"No artist found matching {name!r}")
    return items[0]

artist = get_artist('The Beatles')

Grab all the albums for our artist above

# Collect every album for the artist. Spotify returns at most 50 items per
# request, so keep following the 'next' cursor until it is exhausted.
albums = []
page = sp.artist_albums(artist['id'], album_type='album')
albums.extend(page['items'])
while page['next']:
    page = sp.next(page)
    albums.extend(page['items'])
    
# De-duplicate albums by lowercase name, keeping first-seen order.
# `unique` holds the normalized names, `album_id` the matching Spotify IDs.
# A set mirrors the membership test (the original `name not in unique` list
# scan was O(n) per album).
seen = set()
unique = []
album_id = []

for album in albums:
    name = album['name'].lower()
    if name not in seen:
        seen.add(name)
        unique.append(name)
        album_id.append(album['id'])

print('Total albums on Spotify:', len(unique))

Total albums on Spotify: 23

Grab all the songs on each of their albums

# Collect every track from every album, then flatten the fields needed for
# the tracks dataframe.
tracks = []
ids = []
Artist = []
Track = []
Track_Length = []

for i in album_id:
    results = sp.album_tracks(album_id=i)
    tracks.extend(results['items'])
    # BUG FIX: pagination must happen inside the album loop. The original
    # 'while' was dedented to top level, so extra pages were only fetched
    # for the *last* album -- any album with more tracks than one page was
    # silently truncated.
    while results['next']:
        results = sp.next(results)
        tracks.extend(results['items'])

for track in tracks:
    ids.append(track['id'])
    Track.append(track['name'])
    Artist.append(track['artists'][0]['name'])
    Track_Length.append(track['duration_ms'] / 60000)  # ms -> minutes

df_tracks = pd.DataFrame({'Artist': Artist,
                          'Track': Track,
                          'Track Length': Track_Length,
                          'Track ID': ids})
print(df_tracks.shape)
(534, 4)

Go back and grab some features such as when the album was released and tracks’ popularity

# Fetch album/popularity metadata for every track.
# FIX: sp.tracks() accepts up to 50 IDs per call, so batch the requests
# instead of one sp.track() call per track -- the per-track version was
# hitting Spotify's rate limit (see the "retrying ...1secs" output).
Artist = []
Album_ID = []
Album = []
Album_Release = []
Track = []
Track_ID = []
Popularity = []

track_ids = list(df_tracks['Track ID'])
for start in range(0, len(track_ids), 50):
    batch = sp.tracks(track_ids[start:start + 50])['tracks']
    for t in batch:
        Artist.append(t['artists'][0]['name'])
        Album.append(t['album']['name'])
        Album_ID.append(t['album']['id'])
        Album_Release.append(t['album']['release_date'])
        Track.append(t['name'])
        Track_ID.append(t['id'])
        Popularity.append(t['popularity'])

df_albums = pd.DataFrame({'Artist': Artist,
                          'Album': Album,
                          'Album ID': Album_ID,
                          'Album Release': Album_Release,
                          'Track ID': Track_ID,
                          'Popularity': Popularity})
print(df_albums.shape)
retrying ...1secs
retrying ...1secs
(534, 6)

Grab features of each song like Energy, Loudness, etc.

# Fetch audio features (danceability, energy, ...) for every track.
# FIX: the audio-features endpoint accepts up to 100 IDs per call, so batch
# the requests instead of one call per track (the per-track version hit the
# rate limiter -- see the "retrying ...1secs" output).
audio = []
Danceability = []
Energy = []
Key = []
Loudness = []
Speechiness = []
Acousticness = []
Instrumentalness = []
Liveness = []
Valence = []
Tempo = []
track_id = []

for start in range(0, len(ids), 100):
    audio.extend(sp.audio_features(ids[start:start + 100]))

for feat in audio:
    if feat is None:
        # audio_features yields None for tracks with no analysis; skip
        # instead of crashing on the subscripts below.
        continue
    track_id.append(feat['id'])
    Danceability.append(feat['danceability'])
    Energy.append(feat['energy'])
    Key.append(feat['key'])
    Loudness.append(feat['loudness'])
    Speechiness.append(feat['speechiness'])
    Acousticness.append(feat['acousticness'])
    Instrumentalness.append(feat['instrumentalness'])
    Liveness.append(feat['liveness'])
    Valence.append(feat['valence'])
    Tempo.append(feat['tempo'])

df_audio = pd.DataFrame({'Track ID': track_id,
                        'Danceability':Danceability,
                        'Energy':Energy,
                        'Key':Key,
                        'Loudness':Loudness,
                        'Speechiness':Speechiness,
                        'Acousticness':Acousticness,
                        'Instrumentalness':Instrumentalness,
                        'Liveness':Liveness,
                        'Valence':Valence,
                        'Tempo':Tempo})
print(df_audio.shape)
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
(534, 11)

Merge the previous two dataframes

# Join track metadata, audio features, and album info on the shared key.
df = df_tracks.merge(df_audio, on='Track ID').merge(df_albums, on='Track ID')

Show the first five entries of the dataset

df
Artist_x Track Track Length Track ID Danceability Energy Key Loudness Speechiness Acousticness Instrumentalness Liveness Valence Tempo Artist_y Album Album ID Album Release Popularity
0 The Beatles Come Together - 2019 Mix 4.336667 6lSxM9BKcEZBSDKl2VODsF 0.536 0.360 9 -10.973 0.0408 0.0823 0.167000 0.0996 0.147 164.891 The Beatles Abbey Road (Super Deluxe Edition) 5iT3F2EhjVQVrO4PKhsP8c 2019-09-27 57
1 The Beatles Something - 2019 Mix 3.037100 3S6N0Wbem9KV3DBcYNfXuv 0.416 0.385 0 -9.388 0.0279 0.0958 0.000005 0.2280 0.458 133.298 The Beatles Abbey Road (Super Deluxe Edition) 5iT3F2EhjVQVrO4PKhsP8c 2019-09-27 56
2 The Beatles Maxwell's Silver Hammer - 2019 Mix 3.466217 0fnY9xlLJCgtBUBX9rNzDJ 0.816 0.386 2 -9.443 0.0382 0.4970 0.000093 0.3090 0.708 131.099 The Beatles Abbey Road (Super Deluxe Edition) 5iT3F2EhjVQVrO4PKhsP8c 2019-09-27 53
3 The Beatles Oh! Darling - 2019 Mix 3.452433 3UHv8SSIkNUDRBUHJx3Cg6 0.437 0.669 4 -6.524 0.0395 0.0344 0.003950 0.3470 0.514 173.670 The Beatles Abbey Road (Super Deluxe Edition) 5iT3F2EhjVQVrO4PKhsP8c 2019-09-27 55
4 The Beatles Octopus's Garden - 2019 Mix 2.846667 3e1w0Wm0sH8nUYPArDkBG3 0.565 0.645 1 -6.194 0.0332 0.1130 0.000625 0.1240 0.626 92.253 The Beatles Abbey Road (Super Deluxe Edition) 5iT3F2EhjVQVrO4PKhsP8c 2019-09-27 53
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
529 The Beatles Baby It's You - Remastered 2009 2.675333 2VmB1rF9FtfKUmFHDVnq8Q 0.608 0.494 4 -12.211 0.0345 0.7780 0.000000 0.0926 0.879 112.421 The Beatles Please Please Me (Remastered) 3KzAvEXcqJKBF97HrXwlgf 1963-03-22 58
530 The Beatles Do You Want To Know A Secret - Remastered 2009 1.950217 7Aobt67JnaF7qN8jCCKvHq 0.673 0.349 4 -12.414 0.0368 0.6080 0.000000 0.3800 0.609 124.451 The Beatles Please Please Me (Remastered) 3KzAvEXcqJKBF97HrXwlgf 1963-03-22 61
531 The Beatles A Taste Of Honey - Remastered 2009 2.058000 7fh53ta3vAOGJMQ4i5tCHe 0.420 0.372 1 -11.416 0.0327 0.6980 0.000000 0.1040 0.412 101.408 The Beatles Please Please Me (Remastered) 3KzAvEXcqJKBF97HrXwlgf 1963-03-22 47
532 The Beatles There's A Place - Remastered 2009 1.841550 4dessGxnKXmTbHPhVgqODq 0.455 0.582 4 -10.009 0.0292 0.6290 0.000004 0.1720 0.927 140.928 The Beatles Please Please Me (Remastered) 3KzAvEXcqJKBF97HrXwlgf 1963-03-22 47
533 The Beatles Twist And Shout - Remastered 2009 2.587100 5ZBeML7Lf3FMEVviTyvi8l 0.482 0.849 2 -9.198 0.0452 0.6410 0.000008 0.0414 0.937 124.631 The Beatles Please Please Me (Remastered) 3KzAvEXcqJKBF97HrXwlgf 1963-03-22 73

534 rows × 19 columns

Create new feature for year the album was released.

df['Release Year'] = df['Album Release'].str.split('-').str[0]

Attempting to get all track names written the same way. Some have "(Live)" or "- Remastered 2009" suffixes. Additionally, some tracks were just entered incorrectly in Spotify.

# Normalize track names: cut at the first ' [' / ' (' (Live, Remastered ...)
# or '-' suffix, strip trailing spaces, lowercase.
# FIX: pass regex=True explicitly -- pandas 2.0 changed str.replace's
# default to regex=False, which would treat the pattern as a literal and
# leave the suffixes in place.
df['Track_New'] = (df['Track']
                   .str.replace(r' \[| \(', '-', regex=True)
                   .str.split('-', expand=True)[0]
                   .str.rstrip()
                   .str.lower())
# FIX: select the column before aggregating -- a bare .mean() on the whole
# group raises on the remaining string columns in pandas >= 2.0.
df.groupby('Track_New')['Track Length'].mean()

pd.set_option('display.max_rows', df.shape[0]+1)
df['Track_New'].value_counts()

df['Track_New'].describe()
count                                       534
unique                                      275
top       sgt. pepper's lonely hearts club band
freq                                         11
Name: Track_New, dtype: object

It looks like there are a lot of duplicated songs. There may be some rereleases or greatest hits in the data that’s causing songs to appear more than once.

# Visualize track length distribution per release year.
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="darkgrid")
plt.figure(figsize=(10,6))
a = sns.boxplot(x=df['Release Year'], y=df['Track Length'])
# NOTE(review): set_prop_cycle changes the cycle for *future* artists on
# this Axes, not the boxes already drawn -- confirm whether a palette= or
# color= argument to boxplot was intended.
a.set_prop_cycle(color=['blue'])

png

Track length dramatically changes after 1966! It looks like a few albums were released after 1970. Let's see if the duplications go away when I remove those albums.

# Keep only albums released in 1970 or earlier; the string comparison works
# because 'Release Year' is a fixed-width 4-digit year string.
# FIX: take an explicit copy of the filtered slice so the later column
# assignment no longer raises the SettingWithCopyWarning seen in the
# notebook output.
df = df[df['Release Year'] <= '1970'].copy()
df['Track'].describe()
count                                            274
unique                                           244
top       She's Leaving Home - Take 1 / Instrumental
freq                                               2
Name: Track, dtype: object
# Re-plot track length by year on the filtered (<= 1970) data.
plt.figure(figsize=(10,6))
a=sns.boxplot(x=df['Release Year'], y=df['Track Length'])
# NOTE(review): as above, set_prop_cycle does not recolor existing boxes.
a.set_prop_cycle(color=['red'])

png

# Distribution of track popularity.
# FIX: sns.distplot was deprecated in seaborn 0.11 and removed in 0.14;
# histplot with kde=True and stat='density' reproduces its default output.
a = sns.histplot(df['Popularity'], bins=60, kde=True, stat='density')
a.set_prop_cycle(color=['red'])

png

Now, I want to take a look at how the features are correlated. It doesn't look like any of our features are highly correlated with our target, 'Popularity'.

# Pairwise correlations between the numeric features.
# FIX: restrict to numeric columns -- df.corr() raises on the string
# columns (Artist, Album, Track, ...) in pandas >= 2.0.
corr_df = df.corr(numeric_only=True)
plt.figure(figsize=(10,6))
a=sns.heatmap(corr_df, cmap="Blues")
a.set_prop_cycle(color=['red'])

png

sns.pairplot(df)
<seaborn.axisgrid.PairGrid at 0x1b456a84648>

png

df['Release Year'] = df['Release Year'].astype(str)
C:\Users\roger\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.

Modeling

Importing packages that I will need

from prettytable import PrettyTable
# Sklearn model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
# Sklearn metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error
# Sklearn models
from sklearn.linear_model import Lasso, ElasticNet, Ridge, SGDRegressor
from sklearn.svm import SVR, NuSVR
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.linear_model import LinearRegression

import random

%matplotlib inline

# Make results reproducible
# NOTE(review): random.seed only seeds Python's `random` module; sklearn
# draws from numpy's RNG, so reproducibility here actually depends on the
# explicit random_state= arguments passed to the split and estimators below.
random.seed(100)
df.columns
Index(['Artist_x', 'Track', 'Track Length', 'Track ID', 'Danceability',
       'Energy', 'Key', 'Loudness', 'Speechiness', 'Acousticness',
       'Instrumentalness', 'Liveness', 'Valence', 'Tempo', 'Artist_y', 'Album',
       'Album ID', 'Album Release', 'Popularity', 'Release Year', 'Track_New'],
      dtype='object')
# Predictor columns: the Spotify audio features plus track length and year.
x_columns = ['Track Length', 'Danceability','Energy', 'Key', 'Loudness', 'Speechiness', 'Acousticness',
       'Instrumentalness', 'Liveness', 'Valence', 'Tempo','Release Year']
# Target: the popularity Series itself (despite the name, not a column label).
y_column = df['Popularity']

Splitting data into training and testing sets

# Hold out 20% of the tracks for validation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(df[x_columns], 
                                                    y_column, 
                                                    test_size=0.2, 
                                                    random_state=42)

Running the data through 9 different models to see which model best predicts my test data

# Fit a spread of regressors with near-default settings and tabulate each
# one's test-set MSE and R^2 so the strongest candidate stands out.
table = PrettyTable()
table.field_names = ["Model", "Mean Squared Error", "R² score"]

models = [
    Lasso(alpha=0.1),
    ElasticNet(random_state=0),
    Ridge(alpha=.5),
    SVR(gamma='auto', kernel='linear'),
    SVR(gamma='auto', kernel='rbf'),
    BaggingRegressor(),
    BaggingRegressor(KNeighborsClassifier(), max_samples=0.5, max_features=0.5),
    NuSVR(gamma='auto'),
    RandomForestRegressor( random_state=0, n_estimators=300)
]

for estimator in models:
    estimator.fit(X_train, y_train)
    predictions = estimator.predict(X_test)
    table.add_row([
        type(estimator).__name__,
        f"{mean_squared_error(y_test, predictions):.2f}",
        f"{estimator.score(X_test, y_test):.2f}",
    ])

print(table)
+-----------------------+--------------------+----------+
|         Model         | Mean Squared Error | R² score |
+-----------------------+--------------------+----------+
|         Lasso         |       162.69       |  -0.06   |
|       ElasticNet      |       174.85       |  -0.14   |
|         Ridge         |       163.65       |  -0.07   |
|          SVR          |       178.25       |  -0.16   |
|          SVR          |       167.16       |  -0.09   |
|    BaggingRegressor   |       156.23       |  -0.02   |
|    BaggingRegressor   |       260.56       |  -0.70   |
|         NuSVR         |       178.73       |  -0.16   |
| RandomForestRegressor |       117.07       |   0.24   |
+-----------------------+--------------------+----------+

The random forest model has the lowest MSE and highest R². Below I am going to see if I can improve on that model

# Table setup
table = PrettyTable()
table.field_names = ["Model", "Dataset", "MSE", "MAE", 'RMSLE', "R² score"]

# Model training.
# FIX: dropped criterion='mse' -- it was renamed 'squared_error' in sklearn
# 1.0 and the old spelling raises from 1.2 on. Also pinned random_state so
# the run is actually reproducible (the seeding cell above only seeds
# Python's random module). Every other explicitly-passed argument was the
# library default and is omitted.
model = RandomForestRegressor(n_estimators=200, min_samples_split=4,
                              random_state=100)
model.fit(X_train, y_train)


def evaluate(x, y, dataset):
    """Score `model` on (x, y) and append a row labelled *dataset*
    (MSE, MAE, RMSLE, R²) to the shared PrettyTable."""
    pred = model.predict(x)

    mse = mean_squared_error(y, pred)
    mae = mean_absolute_error(y, pred)
    score = model.score(x, y)
    rmsle = np.sqrt(mean_squared_log_error(y, pred))

    table.add_row([type(model).__name__, dataset, format(mse, '.2f'),
                   format(mae, '.2f'), format(rmsle, '.2f'),
                   format(score, '.2f')])


evaluate(X_train, y_train, 'training')
evaluate(X_test, y_test, 'validation')

print(table)
+-----------------------+------------+--------+------+-------+----------+
|         Model         |  Dataset   |  MSE   | MAE  | RMSLE | R² score |
+-----------------------+------------+--------+------+-------+----------+
| RandomForestRegressor |  training  | 14.67  | 2.90 |  0.09 |   0.92   |
| RandomForestRegressor | validation | 119.15 | 8.50 |  0.21 |   0.22   |
+-----------------------+------------+--------+------+-------+----------+

Feature importance

# Rank features by the forest's impurity-based importances; the per-tree
# standard deviation feeds the error bars in the plot further down.
importances = model.feature_importances_
std = np.std([est.feature_importances_ for est in model.estimators_], axis=0)
indices = np.argsort(importances)[::-1]
print("Feature ranking:")

for rank, idx in enumerate(indices, start=1):
    print("%d. feature %s (%f)" % (rank, x_columns[idx], importances[idx]))
Feature ranking:
1. feature Release Year (0.238756)
2. feature Speechiness (0.220008)
3. feature Loudness (0.097321)
4. feature Valence (0.069859)
5. feature Energy (0.058355)
6. feature Tempo (0.057656)
7. feature Acousticness (0.057093)
8. feature Danceability (0.050881)
9. feature Liveness (0.049203)
10. feature Track Length (0.038286)
11. feature Instrumentalness (0.033633)
12. feature Key (0.028948)

Plot the feature importances of the forest

# Bar chart of feature importances in descending order, with the
# across-tree standard deviation as error bars.
plt.figure(figsize=(14,5))
plt.title("Feature importances")
plt.bar(range(X_test.shape[1]), importances[indices], color="cornflowerblue", yerr=std[indices], align="center")
plt.xticks(range(X_test.shape[1]), [x_columns[i] for i in indices],rotation=45)
plt.xlim([-1, X_test.shape[1]])
plt.show()

png