import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


df = pd.read_csv("../data/NBA Advanced Stats(2019 - 2024).csv")
print(f"Loaded NBA Advanced Stats with {df.shape[0]} rows.")

print("First 5 rows of the NBA Advanced Stats dataset:")
print(df.head())

print("\nDataset Info")
print(df.info())

print("\nSummary Statictics")
print(df.describe())

print("\nMissing values in each column:")
print(df.isnull().sum())

print("\nColumn names:")
print(df.columns)
Loaded NBA Advanced Stats with 4296 rows.
First 5 rows of the NBA Advanced Stats dataset:
   Unnamed: 0  Season         Player   Age Team Pos     G    GS      MP   PER  \
0           0    2019   Bradley Beal  25.0  WAS  SG  82.0  82.0  3028.0  20.8   
1           1    2019   James Harden  29.0  HOU  PG  78.0  78.0  2867.0  30.6   
2           2    2019   Kemba Walker  28.0  CHO  PG  82.0  82.0  2863.0  21.7   
3           3    2019  Tobias Harris  26.0  2TM  PF  82.0  82.0  2847.0  17.2   
4           4    2019  Tobias Harris  26.0  LAC  PF  55.0  55.0  1903.0  18.2   

   ...  USG%   OWS  DWS    WS  WS/48  OBPM  DBPM   BPM  VORP         Awards  
0  ...  28.4   5.9  1.7   7.6  0.120   3.8  -0.8   2.9   3.7             AS  
1  ...  40.5  11.4  3.8  15.2  0.254   9.4   1.6  11.0   9.3  MVP-2,AS,NBA1  
2  ...  31.5   5.5  1.9   7.4  0.123   5.2  -1.0   4.2   4.4        AS,NBA3  
3  ...  22.8   4.5  2.6   7.1  0.120   2.1  -0.5   1.6   2.6            NaN  
4  ...  23.5   3.2  1.8   5.0  0.127   2.7  -0.2   2.4   2.1            NaN  

[5 rows x 30 columns]

Dataset Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4296 entries, 0 to 4295
Data columns (total 30 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  4296 non-null   int64  
 1   Season      4296 non-null   int64  
 2   Player      4296 non-null   object 
 3   Age         4290 non-null   float64
 4   Team        4290 non-null   object 
 5   Pos         4290 non-null   object 
 6   G           4290 non-null   float64
 7   GS          4290 non-null   float64
 8   MP          4290 non-null   float64
 9   PER         4290 non-null   float64
 10  TS%         4262 non-null   float64
 11  3PAr        4260 non-null   float64
 12  FTr         4260 non-null   float64
 13  ORB%        4296 non-null   float64
 14  DRB%        4296 non-null   float64
 15  TRB%        4296 non-null   float64
 16  AST%        4296 non-null   float64
 17  STL%        4296 non-null   float64
 18  BLK%        4296 non-null   float64
 19  TOV%        4266 non-null   float64
 20  USG%        4296 non-null   float64
 21  OWS         4290 non-null   float64
 22  DWS         4290 non-null   float64
 23  WS          4290 non-null   float64
 24  WS/48       4290 non-null   float64
 25  OBPM        4290 non-null   float64
 26  DBPM        4290 non-null   float64
 27  BPM         4290 non-null   float64
 28  VORP        4290 non-null   float64
 29  Awards      319 non-null    object 
dtypes: float64(24), int64(2), object(4)
memory usage: 1007.0+ KB
None

Summary Statictics
        Unnamed: 0       Season          Age            G           GS  \
count  4296.000000  4296.000000  4290.000000  4290.000000  4290.000000   
mean   2147.500000  2021.537942    26.006760    39.828205    18.212587   
std    1240.292707     1.699265     4.171169    24.704211    24.255182   
min       0.000000  2019.000000    19.000000     1.000000     0.000000   
25%    1073.750000  2020.000000    23.000000    17.000000     0.000000   
50%    2147.500000  2022.000000    25.000000    41.000000     5.000000   
75%    3221.250000  2023.000000    29.000000    62.000000    29.000000   
max    4295.000000  2024.000000    43.000000    84.000000    83.000000   

                MP          PER          TS%         3PAr          FTr  ...  \
count  4290.000000  4290.000000  4262.000000  4260.000000  4260.000000  ...   
mean    896.133100    12.638718     0.540602     0.398038     0.249558  ...   
std     766.699467     6.843749     0.114524     0.223305     0.205775  ...   
min       1.000000   -45.200000     0.000000     0.000000     0.000000  ...   
25%     199.000000     9.500000     0.506000     0.260750     0.141000  ...   
50%     707.500000    12.600000     0.553000     0.408000     0.223000  ...   
75%    1475.750000    16.100000     0.596000     0.544250     0.316000  ...   
max    3028.000000    80.300000     1.500000     1.000000     4.000000  ...   

              TOV%         USG%          OWS          DWS           WS  \
count  4266.000000  4296.000000  4290.000000  4290.000000  4290.000000   
mean     12.468401    18.234916     0.949301     0.895128     1.846340   
std       6.858016     5.869599     1.592891     0.935413     2.342021   
min       0.000000     0.000000    -3.000000    -0.500000    -1.900000   
25%       9.200000    14.400000     0.000000     0.200000     0.100000   
50%      11.800000    17.600000     0.300000     0.600000     1.000000   
75%      14.800000    21.400000     1.400000     1.400000     2.800000   
max     100.000000    72.500000    12.200000     5.900000    17.000000   

             WS/48         OBPM         DBPM          BPM         VORP  
count  4290.000000  4290.000000  4290.000000  4290.000000  4290.000000  
mean      0.071516    -1.675664    -0.241865    -1.917902     0.423357  
std       0.108847     4.057487     2.058934     5.220443     1.075243  
min      -1.231000   -52.400000   -31.100000   -81.400000    -2.000000  
25%       0.035000    -3.300000    -1.100000    -3.700000    -0.100000  
50%       0.079000    -1.400000    -0.200000    -1.500000     0.000000  
75%       0.122000     0.300000     0.700000     0.500000     0.600000  
max       1.261000    40.100000    32.700000    52.000000    10.600000  

[8 rows x 26 columns]

Missing values in each column:
Unnamed: 0       0
Season           0
Player           0
Age              6
Team             6
Pos              6
G                6
GS               6
MP               6
PER              6
TS%             34
3PAr            36
FTr             36
ORB%             0
DRB%             0
TRB%             0
AST%             0
STL%             0
BLK%             0
TOV%            30
USG%             0
OWS              6
DWS              6
WS               6
WS/48            6
OBPM             6
DBPM             6
BPM              6
VORP             6
Awards        3977
dtype: int64

Column names:
Index(['Unnamed: 0', 'Season', 'Player', 'Age', 'Team', 'Pos', 'G', 'GS', 'MP',
       'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%',
       'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM',
       'BPM', 'VORP', 'Awards'],
      dtype='object')
df_per_ts = df[['Season','Player','PER','TS%']].dropna()
save_path_PER = "../data/PER-TS%.csv"
df_per_ts.to_csv(save_path_PER,index=False)
print(df_per_ts)

df_salary = pd.read_csv("../data/NBA Salaries(2019-2024).csv")
print(df_salary.head())
      Season              Player   PER    TS%
0       2019        Bradley Beal  20.8  0.581
1       2019        James Harden  30.6  0.616
2       2019        Kemba Walker  21.7  0.558
3       2019       Tobias Harris  17.2  0.592
4       2019       Tobias Harris  18.2  0.605
...      ...                 ...   ...    ...
4286    2024     Markquis Nowell  55.5  0.532
4287    2024         Isaiah Wong  -1.1  0.333
4289    2024  Izaiah Brockington  26.6  0.400
4291    2024      Jalen Crutcher -12.6  0.000
4292    2024   Dmytro Skapintsev -19.3  0.000

[4256 rows x 4 columns]
   Unnamed: 0         playerName      2019/20   2019/20(*)      2020/21  \
0           0      Stephen Curry  $40,231,758  $49,346,703  $43,006,362   
1           1  Russell Westbrook  $38,506,482  $47,230,546  $41,358,814   
2           2         Chris Paul  $38,506,482  $47,230,546  $41,358,814   
3           3       James Harden  $38,199,000  $46,853,401  $41,254,920   
4           4          John Wall  $38,199,000  $46,853,401  $41,254,920   

    2020/21(*)      2021/22   2021/22(*)      2022/23   2022/23(*)  \
0  $52,411,485  $45,780,966  $52,938,707  $48,070,014  $50,968,059   
1  $50,403,633  $44,211,146  $51,123,449  $47,080,179  $49,918,548   
2  $50,403,633  $30,800,000  $35,615,504  $28,400,000  $30,112,179   
3  $50,277,018  $44,310,840  $51,238,730  $33,000,000  $34,989,504   
4  $50,277,018  $44,310,840  $51,238,730  $47,345,760  $50,200,141   

       2023/24   2023/24(*)  
0  $51,915,615  $53,458,234  
1   $3,835,738   $3,949,713  
2  $30,800,000  $31,715,190  
3  $35,680,595  $36,740,807  
4          NaN          NaN  
df_2019_20 = df_salary[['playerName','2019/20']].dropna()
df_2020_21 = df_salary[['playerName','2020/21']].dropna()
df_2021_22 = df_salary[['playerName','2021/22']].dropna()
df_2022_23 = df_salary[['playerName','2022/23']].dropna()
df_2023_24 = df_salary[['playerName','2023/24']].dropna()
df_2019_20.rename(columns={'playerName':'Player_name'}, inplace=True)
df_2020_21.rename(columns={'playerName':'Player_name'}, inplace=True)
df_2021_22.rename(columns={'playerName':'Player_name'}, inplace=True)
df_2022_23.rename(columns={'playerName':'Player_name'}, inplace=True)
df_2023_24.rename(columns={'playerName':'Player_name'}, inplace=True)

df_selectd = df[['Season','Player','Pos']]
df_selectd.rename(columns={'Player':'player_name','Season':'season'}, inplace=True)
save_path_6 = "../data/player_pos.csv"
df_selectd.to_csv(save_path_6, index=False)
C:\Users\27508\AppData\Local\Temp\ipykernel_38744\1694157263.py:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selectd.rename(columns={'Player':'player_name','Season':'season'}, inplace=True)
df_pos = pd.read_csv("../data/player_pos.csv")

seasons = df_pos["season"].unique()

for season in seasons:
    df_season = df_pos[df_pos["season"] == season]
    file_name = f"season_{season}.csv"
    save_path_7 = f"../data/{file_name}"
    df_season.to_csv(save_path_7, index=False)

df_pos_2019 = pd.read_csv("../data/season_2019.csv")
df_pos_2020 = pd.read_csv("../data/season_2020.csv")
df_pos_2021 = pd.read_csv("../data/season_2021.csv")
df_pos_2022 = pd.read_csv("../data/season_2022.csv")

df_merged_data_2019 = pd.read_csv("../data/merged_data_19-20.csv")
df_merged_data_2020 = pd.read_csv("../data/merged_data_20-21.csv")
df_merged_data_2021 = pd.read_csv("../data/merged_data_21-22.csv")
df_merged_data_2022 = pd.read_csv("../data/merged_data_22-23.csv")

df_merged_data_pos_2019 = pd.merge(df_merged_data_2019, df_pos_2019, on='player_name', how='left').dropna(subset=['Pos'])
df_merged_data_pos_2020 = pd.merge(df_merged_data_2020, df_pos_2020, on='player_name', how='left').dropna(subset=['Pos'])
df_merged_data_pos_2021 = pd.merge(df_merged_data_2021, df_pos_2021, on='player_name', how='left').dropna(subset=['Pos'])
df_merged_data_pos_2022 = pd.merge(df_merged_data_2022, df_pos_2022, on='player_name', how='left').dropna(subset=['Pos'])

save_path_8 = "../data/merged_data_pos-2019.csv"
save_path_9 = "../data/merged_data_pos-2020.csv"
save_path_10 = "../data/merged_data_pos-2021.csv"
save_path_11 = "../data/merged_data_pos-2022.csv"
df_merged_data_pos_2019.to_csv(save_path_8, index=False)
df_merged_data_pos_2020.to_csv(save_path_9, index=False)
df_merged_data_pos_2021.to_csv(save_path_10, index=False)
df_merged_data_pos_2022.to_csv(save_path_11, index=False)