path: root/getData.py
blob: dd55cc207255cc22faec6e3cee1c27f3ef32996f
import pandas as pd
import requests
from utils import osSpecific  # Helper functions to delete and create the directories needed for storing data
from dotenv import load_dotenv
import os

'''
This script fetches NBA teams and their players (both active and retired), along with some
additional information about them.
The data is stored in CSV format under a "Data" directory in the current working directory
(any existing "Data" directory is overwritten).

Author: Rasmus Luha
Created_at: 16.03.2022
Data fetched from: https://rapidapi.com/theapiguy/api/free-nba/
'''

# Loading API key from environment variables.
load_dotenv()
API_KEY = os.getenv("API_KEY")

# To use this script, create a .env file in the current directory with the line: API_KEY=<your API key>
# You can get an API key from the URL listed above.
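
# Example .env contents (the key below is a placeholder, not a real key):
#   API_KEY=0123456789abcdef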

# API request details
url = "https://free-nba.p.rapidapi.com/"
headers = {
            'x-rapidapi-host': "free-nba.p.rapidapi.com",
            'x-rapidapi-key': API_KEY
          }

# File path variables for storing data (the Windows branch uses backslash separators;
# note that backslashes must be escaped in Python string literals)
if osSpecific.whichOs() == "windows":
    teamsFile = "Data\\NBAteams.csv"
    playersDir = "Data\\Players\\"
else:
    teamsFile = "Data/NBAteams.csv"
    playersDir = "Data/Players/"

# Recreate the Data directory so that repeated runs do not append duplicate rows
osSpecific.deleteDataDir()
osSpecific.addDataDir()
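
# Note: utils.osSpecific is a project-local helper. A portable sketch of the same
# directory reset (an assumption about its behavior, not the author's implementation):
#   import shutil
#   shutil.rmtree("Data", ignore_errors=True)   # remove any previous Data directory
#   os.makedirs(playersDir, exist_ok=True)      # recreate Data/Players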



###### Functions ######

def getTeamsData(url, headers):

    querystring = {"page": "0"}
    response = requests.get(url + "teams", headers=headers, params=querystring)

    teamsDf = pd.DataFrame(response.json()["data"])
    # set_index returns a new DataFrame; assign the result so "id" actually becomes the index
    teamsDf = teamsDf.set_index("id")
    teamsDf.to_csv(teamsFile)
    print("Teams data stored in Data directory as \"NBAteams.csv\"")


def getPlayerData(url, headers):

    # The first request is made only to find the number of pages that must be looped through
    querystring = {"per_page": "100", "page": "0"}
    response = requests.get(url + "players", headers=headers, params=querystring)
    pageCount = response.json()["meta"]["total_pages"]

    for el in range(1, pageCount + 1):

        # Request pages in a loop up to pageCount
        querystring = {"per_page": "100", "page": el}
        response = requests.get(url + "players", headers=headers, params=querystring)
        data = response.json()["data"]
        
        for player in data:
            teamName = player["team"]["full_name"]
            playerDf = pd.DataFrame(columns=["first_name", "last_name", "position", "height_feet", "height_inches"])

            playerSeries = pd.Series({"first_name": player["first_name"],
                                      "last_name": player["last_name"],
                                      "position": player["position"],
                                      "height_feet": player["height_feet"],
                                      "height_inches": player["height_inches"]})

            # Add the player as a row to the one-row dataframe
            playerDf.loc[len(playerDf)] = playerSeries
            # Append the row to the player's team CSV file
            playerDf.to_csv(playersDir + teamName + ".csv", mode='a', index=False, header=False)
    
        print("Page "+str(el)+" read.") 
    print("All done, check \"Data\" Dir.")




# Requesting and storing data
if __name__ == "__main__":
    getTeamsData(url, headers)
    getPlayerData(url, headers)
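
# Usage: with the dependencies installed (pip install pandas requests python-dotenv)
# and a .env file in place, run:  python getData.py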