#Import necessary packages
import os
import warnings
from tqdm import tqdm
import socceraction.spadl as spadl
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

#Import all the necessary dataframes
xP = pd.read_csv("xPactions.csv", index_col = 0)
fb = pd.read_csv("teamsFOTMOB.csv", index_col = 0)
positions = pd.read_csv("clustered_position.csv", index_col = 0)

players0 = pd.read_csv("players2021.csv", index_col = 0)
players1 = pd.read_csv("players2122.csv", index_col = 0)
players2 = pd.read_csv("players2223.csv", index_col = 0)
players3 = pd.read_csv("players2324.csv", index_col = 0)
players4 = pd.read_csv("players2425.csv", index_col = 0)

games0 = pd.read_csv("games2021.csv", index_col = 0)
games1 = pd.read_csv("games2122.csv", index_col = 0)
games2 = pd.read_csv("games2223.csv", index_col = 0)
games3 = pd.read_csv("games2324.csv", index_col = 0)
games4 = pd.read_csv("games2425.csv", index_col = 0)

actions0 = pd.read_csv("actions2021.csv", index_col = 0)
actions1 = pd.read_csv("actions2122.csv", index_col = 0)
actions2 = pd.read_csv("actions2223.csv", index_col = 0)
actions3 = pd.read_csv("actions2324.csv", index_col = 0)
actions4 = pd.read_csv("actions2425.csv", index_col = 0)

#Concat together the same dataframes from different seasons
players = pd.concat([players0, players1, players2, players3, players4])
games = pd.concat([games0, games1, games2, games3, games4])
actions = pd.concat([actions0, actions1, actions2, actions3, actions4])

#Adjust the action_id of the action dataframe as it started from 1 with each session
actions.drop(columns=['action_id'], inplace=True)
actions.reset_index(drop=True, inplace=True)
actions.reset_index(inplace=True)
actions.rename(columns={'index': 'action_id'}, inplace=True)

# Convert 'minutes_played' to total minutes with error handling
def convert_to_minutes(time_str):
    try:
        # Convert to string in case it's a float (e.g., NaN)
        time_str = str(time_str)
        # Split the time string into minutes and seconds
        minutes, seconds = map(int, time_str.split(':'))
        # Convert total time to minutes (seconds converted to fraction of minutes)
        return minutes + seconds / 60
    except (ValueError, AttributeError):
        # Handle cases where the conversion fails (e.g., NaN or bad format)
        return 0  # or use `np.nan` if you prefer to mark as missing

# Apply the conversion function to the 'minutes_played' column
players['minutes_played_converted'] = players['minutes_played'].apply(convert_to_minutes)

#Merging together players with identifiers and names for teams
players = players.merge(fb, how="left")

#Creating a table to have minutes played for players in each season and team
mp = players.groupby(["player_name", "player_id", "team_name",
                      "fotmob_id", "season_id"])["minutes_played_converted"].sum().reset_index(name='minutes_played')

#We filter the games dataframe to contain only certain informations
games0 = games[["game_id", "competition_id", "season_id"]]

#Using the spadl/vaep framework to add names on the ids in columns
actions0 = spadl.add_names(actions)

#Merging together actions + games info + players and teams infos + xP values
actions1 = (actions0
           .merge(xP, how="left")
           .merge(fb, how="left")
           .merge(games0, how="left"))

#We also merge the minutes played in the season for the player on the actions
actions = actions1.merge(mp, how="left")

#Creating a column of outcome for successful actions (they are flagged as 1)
actions['outcome'] = np.where((actions["result_name"] == 'success'), True, False)

#We do the necessary calculation to calculate the PPDA each team faced from opposition in each game
#We first store for each game how many passes a team attempted in theirs first 60% of the pitch
passes = actions[actions['type_name'] == 'pass']
passes0 = passes[passes['start_x'] <= 63]
X0 = passes0.groupby(["team_id", "team_name", "game_id", "competition_id", "season_id"])["type_name"].count().reset_index(name='passes')

#We then store how many defensive actions a team attempted in the last 60% of the pitch
defactions0 = actions[actions['type_name'].isin(['tackle', 'foul', 'interception'])]
defactions = defactions0[defactions0['start_x'] >= 42]
X1 = defactions.groupby(["team_id", "team_name", "game_id", "competition_id", "season_id"])["type_name"].count().reset_index(name='defactions')

#This two will be the necessary values to find the PPDA of the opponents as we want to know 
#how many passes per defensive actions the team was allowed to attempt

# Step 1: Create a dictionary to hold game_id to defensive actions of each team
game_teams = X0.groupby('game_id').apply(lambda x: x[['passes', 'team_id', 'team_name',]].to_dict('records')).to_dict()

# Step 2: Define a function to get both opponent defactions and team_id/name
def get_opponent_info(row):
    game_id = row['game_id']
    team_id = row['team_id']
    
    # Get all teams in the current game
    teams_in_game = game_teams[game_id]
    
     # Find the opponent team (the team that isn't the current one)
    opponent = [team for team in teams_in_game if team['team_id'] != team_id]
    
    # If we found an opponent, return both defactions and team_id/team_name
    if opponent:
        opponent = opponent[0]
        return opponent['passes'], opponent['team_id'], opponent['team_name']
    return None, None, None

# Step 3: Apply the function to get opponent defactions
X0['opponent_passes'], X0['opponent_id'], X0['opponent'], = zip(*X0.apply(get_opponent_info, axis=1))

/var/folders/ns/3wxdg4g57h77vxwmr4wzmvt40000gn/T/ipykernel_8484/2710470026.py:2: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  game_teams = X0.groupby('game_id').apply(lambda x: x[['passes', 'team_id', 'team_name',]].to_dict('records')).to_dict()

# Step 1: Create a dictionary to hold game_id to both team_id and defactions
game_teams = X1.groupby('game_id').apply(lambda x: x[['defactions', 'team_id', 'team_name',]].to_dict('records')).to_dict()

# Step 2: Define a function to get both opponent defactions and team_id/name
def get_opponent_info(row):
    game_id = row['game_id']
    team_id = row['team_id']
    
    # Get all teams in the current game
    teams_in_game = game_teams[game_id]
    
    # Find the opponent team (the team that isn't the current one)
    opponent = [team for team in teams_in_game if team['team_id'] != team_id]
    
    # If we found an opponent, return both defactions and team_id/team_name
    if opponent:
        opponent = opponent[0]
        return opponent['defactions'], opponent['team_id'], opponent['team_name']
    return None, None, None

# Step 3: Apply the function and unpack the results into three new columns
X1['opponent_defactions'], X1['opponent_id'], X1['opponent'], = zip(*X1.apply(get_opponent_info, axis=1))

/var/folders/ns/3wxdg4g57h77vxwmr4wzmvt40000gn/T/ipykernel_8484/2738209691.py:2: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  game_teams = X1.groupby('game_id').apply(lambda x: x[['defactions', 'team_id', 'team_name',]].to_dict('records')).to_dict()

# Perform a left merge so we have a table with both passes and defensive actions for team and opponents
X = X0.merge(X1, how="left", on=["team_id", "team_name", "game_id", "competition_id", "season_id", "opponent_id", "opponent"])

X

#We calculate for each game both the team PPDA and opponent's
X['opponent_PPDA'] = X['passes'] / X['opponent_defactions']
X['PPDA'] = X['opponent_passes'] / X['defactions']

#We add dates to the table so we can visualize how intense opponents have been over time (if we want)
dates = games[['game_id', 'game_date']]
Ya = (X.merge(dates, how="left"))

#We filter the table to have the final one
Y = Ya[['team_id', 'team_name', 'game_id', 'game_date', 'season_id', 'opponent_id', 'opponent', 'opponent_PPDA']]

#We find the lowest 30% of intense PPDA so to have a threshold (would be the 30% most intense)
quantiles = Y.groupby('team_id')['opponent_PPDA'].quantile(0.3).reset_index()

# Rename the column to indicate it's the threshold
quantiles = quantiles.rename(columns={'opponent_PPDA': 'opponent_PPDA_thrs'})

# Merge the threshold data back into the original DataFrame
Z = Y.merge(quantiles, on='team_id', how='left')

# Filter for team_name and sort by game_date for the visualization
df_team = Z[Z['team_name'] == 'Juventus'].sort_values(by='game_date')

# Plotting
plt.figure(figsize=(20,10))
plt.plot(df_team['game_date'], df_team['opponent_PPDA'], marker='o', linestyle='-', color='grey')
quantile_value = df_team['opponent_PPDA_thrs'].iloc[0]
plt.axhline(y=quantile_value, color='red', linestyle='--', zorder=2)

# Adding labels and title
plt.xlabel('Game Date')
plt.ylabel('Opponent PPDA')
plt.title(f'Opponent PPDA for {df_team["team_name"].iloc[0]} from {df_team["season_id"].min()} to {df_team["season_id"].max()}')

# Annotate each point with the opponent's name
for i in range(len(df_team)):
    plt.annotate(df_team['opponent'].iloc[i], 
                 (df_team['game_date'].iloc[i], df_team['opponent_PPDA'].iloc[i]),
                 textcoords="offset points", xytext=(0,5), ha='center')

# Invert the y-axis
plt.gca().invert_yaxis()

# Reduce number of x-axis ticks
n = 10  # Show every 3rd date, adjust as needed
plt.xticks(df_team['game_date'][::n], rotation=45)

# Show plot
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

#We now do the necessary calculations to see how players passing % change against pressing intensive teams
#and use that as a proxy of player ability under pressure

#We calculate number of attempted and successful passes per game for players
passes['PAx'] = passes['outcome'] - passes['xP']
P0 = passes.groupby(["team_id", "team_name", "player_id", "player_name", "game_id",
                     "season_id"])["xP"].sum().reset_index()
P1 = (passes.groupby(["team_id", "team_name", "player_id", "player_name", "game_id", 
                      "season_id"], observed = True)["type_name"].count().reset_index(name='attempted_passes'))

passes1 = passes[passes["result_name"] == 'success']
P2 = (passes1.groupby(["team_id", "team_name", "player_id", "player_name", "game_id",
                       "season_id"], observed = True)["type_name"].count().reset_index(name='successful_passes'))

/var/folders/ns/3wxdg4g57h77vxwmr4wzmvt40000gn/T/ipykernel_8484/4274909673.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  passes['PAx'] = passes['outcome'] - passes['xP']

#we merge to have a single dataframe and then fillna
P = (P0
    .merge(P1, how="left")
    .merge(P2, how="left"))

P[['xP', 'attempted_passes', 'successful_passes']] = P[['xP', 'attempted_passes', 'successful_passes']].fillna(0)

#We filter down the precedently made table and merge it with cluster positions
mpa = mp[['player_name', 'player_id', 'team_name', 'season_id', 'minutes_played']]
mpa = mpa.merge(positions, how='left')

#Then we merge it on the table of players passes attempted and completed
finalP = P.merge(mpa, how="left").merge(Y, how="left")
finalP

#We find the lowest 30% of intense PPDA of oppontent's for players in season
quantiles = finalP.groupby(['team_id', 'player_id', 'season_id'])['opponent_PPDA'].quantile(0.3).reset_index()

# Rename the column to indicate it's the 20th percentile
quantiles = quantiles.rename(columns={'opponent_PPDA': 'opponent_PPDA_thrs'})

#We merge the threshold on the table
finalP = finalP.merge(quantiles, how='left')

# Step 1: Categorize matches based on PPDA threshold
finalP['ppda_category'] = finalP['opponent_PPDA'].le(finalP['opponent_PPDA_thrs'])  # True if PPDA ≤ threshold, else False

# Step 2: Group and aggregate
aggregated = (
    finalP.groupby(['team_id', 'team_name', 'player_id', 'player_name', 'season_id', 'ppda_category'])
      .agg(
          xP_sum=('xP', 'sum'),
          attempted_passes_sum=('attempted_passes', 'sum'),
          successful_passes_sum=('successful_passes', 'sum')
      )
      .reset_index()
)

# Step 3: Separate results into "low-PPDA" and "high-PPDA"
low_ppda = aggregated[aggregated['ppda_category'] == True]
high_ppda = aggregated[aggregated['ppda_category'] == False]

#We calculate expected and actual percentage both for games below and over the threshold
high_ppda['xP%'] = (high_ppda['xP_sum'] / high_ppda['attempted_passes_sum']) * 100
high_ppda['Passes %'] = (high_ppda['successful_passes_sum'] /high_ppda['attempted_passes_sum']) * 100

low_ppda['xP%'] = (low_ppda['xP_sum'] / low_ppda['attempted_passes_sum']) * 100
low_ppda['Passes %'] = (low_ppda['successful_passes_sum'] /low_ppda['attempted_passes_sum']) * 100

/var/folders/ns/3wxdg4g57h77vxwmr4wzmvt40000gn/T/ipykernel_8484/3043594213.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  high_ppda['xP%'] = (high_ppda['xP_sum'] / high_ppda['attempted_passes_sum']) * 100
/var/folders/ns/3wxdg4g57h77vxwmr4wzmvt40000gn/T/ipykernel_8484/3043594213.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  high_ppda['Passes %'] = (high_ppda['successful_passes_sum'] /high_ppda['attempted_passes_sum']) * 100
/var/folders/ns/3wxdg4g57h77vxwmr4wzmvt40000gn/T/ipykernel_8484/3043594213.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  low_ppda['xP%'] = (low_ppda['xP_sum'] / low_ppda['attempted_passes_sum']) * 100
/var/folders/ns/3wxdg4g57h77vxwmr4wzmvt40000gn/T/ipykernel_8484/3043594213.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  low_ppda['Passes %'] = (low_ppda['successful_passes_sum'] /low_ppda['attempted_passes_sum']) * 100

#We filter both dataframe for below and over the threshold
high_ppda = high_ppda[['team_id', 'team_name', 'player_id', 'player_name', 'season_id', 'Passes %']]
low_ppda = low_ppda[['team_id', 'team_name', 'player_id', 'player_name', 'season_id', 'Passes %']]

# Renaming the `Passes %` column to distinguish between the two dataframes
df1 = high_ppda.rename(columns={'Passes %': 'Passes %'})
df2 = low_ppda.rename(columns={'Passes %': 'Pressured_Passes %'})

# We then merge the two
merged_df = df1.merge(df2)

# Finally we calculate the difference the player have under pressure
merged_df['press_change_pass_pct'] = merged_df['Pressured_Passes %'] - merged_df['Passes %']

#To explore the result we merge the players info dataframe and look at what positions we have
merged_df = merged_df.merge(mpa, how='left')
merged_df.position_group.unique()

array(['CDM', 'CB', 'Undefined', 'AMW', 'GK', 'WB', 'ST', nan],
      dtype=object)

#We then filter for player with at least 1000 minutes played in 2425, go for the best under pressure to list and select position
X = merged_df.sort_values(by=['press_change_pass_pct'], ascending=False)
X = X[(X['minutes_played'] > 1000) & (X['season_id'] == 2425)]
X[X['position_group'].isin(['CB'])].head(20)

#We then select the columns we want to keep from the non filtered dataframe and save it for later use
to_use = merged_df[['team_name', 'player_id', 'player_name', 'season_id', 'press_change_pass_pct']]
to_use.to_csv("underpressureproxy.csv")

	team_id	team_name	game_id	competition_id	season_id	passes	opponent_passes	opponent_id	opponent	defactions	opponent_defactions
0	1	Brondby IF	1592218	EU-Europa League	2122	208	246.0	354.0	Sparta Prague	32.0	34.0
1	1	Brondby IF	1592220	EU-Europa League	2122	130	509.0	228.0	Lyon	39.0	18.0
2	1	Brondby IF	1592222	EU-Europa League	2122	314	366.0	306.0	Rangers	33.0	32.0
3	1	Brondby IF	1592224	EU-Europa League	2122	179	299.0	306.0	Rangers	26.0	26.0
4	1	Brondby IF	1592226	EU-Europa League	2122	161	416.0	228.0	Lyon	22.0	23.0
...	...	...	...	...	...	...	...	...	...	...	...
59797	30664	St. Louis City	1791167	USA-Major League Soccer	2024	166	289.0	1122.0	San Jose	32.0	27.0
59798	30664	St. Louis City	1791196	USA-Major League Soccer	2024	219	205.0	11134.0	Vancouver	24.0	30.0
59799	30664	St. Louis City	1791214	USA-Major League Soccer	2024	149	326.0	3624.0	Houston	35.0	30.0
59800	30664	St. Louis City	1791224	USA-Major League Soccer	2024	222	232.0	11134.0	Vancouver	28.0	34.0
59801	30664	St. Louis City	1791466	USA-Major League Soccer	2024	358	195.0	9293.0	Minnesota United	26.0	28.0

	team_id	team_name	player_id	player_name	game_id	season_id	xP	attempted_passes	successful_passes	minutes_played	position	position_group	game_date	opponent_id	opponent	opponent_PPDA
0	1	Brondby IF	108942.0	Josip Radosevic	1592218	2122	41.013142	49	41.0	456.700000	CM	CDM	2021-09-16 20:00:00	354.0	Sparta Prague	6.117647
1	1	Brondby IF	108942.0	Josip Radosevic	1592220	2122	28.089613	35	27.0	456.700000	CM	CDM	2021-09-30 17:45:00	228.0	Lyon	7.222222
2	1	Brondby IF	108942.0	Josip Radosevic	1592222	2122	26.208827	32	26.0	456.700000	CM	CDM	2021-10-21 20:00:00	306.0	Rangers	9.812500
3	1	Brondby IF	108942.0	Josip Radosevic	1592224	2122	43.584918	53	39.0	456.700000	CM	CDM	2021-11-04 17:45:00	306.0	Rangers	6.884615
4	1	Brondby IF	108942.0	Josip Radosevic	1592226	2122	28.035128	35	27.0	456.700000	CM	CDM	2021-11-25 20:00:00	228.0	Lyon	7.000000
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
891915	30664	St. Louis City	533552.0	Jake Girdwood-Reich	1791090	2024	2.507253	4	2.0	195.083333	DM	CDM	2024-08-25 03:30:00	11133.0	Portland	6.533333
891916	30664	St. Louis City	533552.0	Jake Girdwood-Reich	1791129	2024	2.120877	3	3.0	195.083333	DM	CDM	2024-09-29 01:30:00	1116.0	Kansas City	5.909091
891917	30664	St. Louis City	533552.0	Jake Girdwood-Reich	1791146	2024	33.403613	40	35.0	195.083333	DM	CDM	2024-10-03 03:30:00	27482.0	Los Angeles FC	9.147059
891918	30664	St. Louis City	533552.0	Jake Girdwood-Reich	1791167	2024	4.820073	6	6.0	195.083333	DM	CDM	2024-09-22 03:30:00	1122.0	San Jose	6.148148
891919	30664	St. Louis City	533552.0	Jake Girdwood-Reich	1791466	2024	0.888112	1	1.0	195.083333	DM	CDM	2024-10-20 02:00:00	9293.0	Minnesota United	12.785714

	team_id	team_name	player_id	player_name	season_id	Passes %	Pressured_Passes %	press_change_pass_pct	minutes_played	position	position_group
27736	838	Las Palmas	497198.0	Juan Herzog	2425	82.558140	92.307692	9.749553	1122.483333	RCB	CB
38175	2157	Accrington	432338.0	Zach Awe	2425	62.696629	71.084337	8.387708	2281.816667	LCB	CB
16149	203	Lincoln City	144375.0	Adam Jackson	2425	67.073171	73.157895	6.084724	2423.500000	RCB	CB
8125	98	Exeter	134174.0	Ben Purrington	2425	67.936508	73.303167	5.366659	1215.633333	LCB	CB
1303	22	Bradford	469637.0	Ciarán Kelly	2425	65.237020	70.588235	5.351215	1514.066667	LCB	CB
19758	258	Cheltenham	382792.0	Ibrahim Bakare	2425	55.379747	60.447761	5.068014	1856.900000	RCB	CB
1585	25	Sheff Wed	83778.0	Liam Palmer	2425	84.352078	89.285714	4.933636	1259.033333	RCB	CB
9872	128	FC Utrecht	101605.0	Mike van der Hoorn	2425	80.915287	85.675676	4.760388	2488.600000	RCB	CB
31815	990	Stevenage	134494.0	Nathan Thompson	2425	80.625000	85.234899	4.609899	1366.733333	RCB	CB
38554	2188	Estoril	436897.0	Pedro Álvaro	2425	84.417178	88.857939	4.440761	2624.666667	RCB	CB
12370	163	Sheff Utd	312670.0	Hamza Choudhury	2425	81.081081	85.401460	4.320379	1111.950000	RCB	CB
3277	41	Stuttgart	416149.0	Anthony Rouault	2425	89.125296	92.822967	3.697671	1970.500000	RCB	CB
15523	193	Cambridge U	374630.0	Kelland Watts	2425	59.449541	62.978723	3.529182	2294.433333	LCB	CB
24006	318	Hearts	421328.0	Craig Halkett	2425	88.059701	91.304348	3.244646	1010.900000	RCB	CB
5678	71	Bologna	349406.0	Nicolò Casale	2425	87.309645	90.551181	3.241536	1019.366667	LCB	CB
9692	122	Boavista	444822.0	Pedro Gomes	2425	75.636364	78.761062	3.124698	1552.850000	RCB	CB
46063	28583	Dynamo Makhachkala	531314.0	Idar Shumakhov	2425	74.774775	77.431907	2.657132	2423.183333	RCB	CB
1553	24	Aston Villa	408946.0	Lamare Bogarde	2425	90.267640	92.857143	2.589503	1035.550000	RCB	CB
23062	305	St. Johnstone	542627.0	Bozo Mikulic	2425	83.301344	85.792350	2.491006	1262.800000	LCB	CB
13353	171	QPR	121613.0	Morgan Fox	2425	79.120879	81.560284	2.439405	1501.500000	LCB	CB