In [1]:
#Import necessary packages
import os
import warnings
from tqdm import tqdm
import socceraction.spadl as spadl
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
In [2]:
#Import all the necessary dataframes
xP = pd.read_csv("xPactions.csv", index_col = 0)
fb = pd.read_csv("teamsFOTMOB.csv", index_col = 0)
positions = pd.read_csv("clustered_position.csv", index_col = 0)
In [3]:
players0 = pd.read_csv("players2021.csv", index_col = 0)
players1 = pd.read_csv("players2122.csv", index_col = 0)
players2 = pd.read_csv("players2223.csv", index_col = 0)
players3 = pd.read_csv("players2324.csv", index_col = 0)
players4 = pd.read_csv("players2425.csv", index_col = 0)
In [4]:
games0 = pd.read_csv("games2021.csv", index_col = 0)
games1 = pd.read_csv("games2122.csv", index_col = 0)
games2 = pd.read_csv("games2223.csv", index_col = 0)
games3 = pd.read_csv("games2324.csv", index_col = 0)
games4 = pd.read_csv("games2425.csv", index_col = 0)
In [5]:
actions0 = pd.read_csv("actions2021.csv", index_col = 0)
actions1 = pd.read_csv("actions2122.csv", index_col = 0)
actions2 = pd.read_csv("actions2223.csv", index_col = 0)
actions3 = pd.read_csv("actions2324.csv", index_col = 0)
actions4 = pd.read_csv("actions2425.csv", index_col = 0)
In [6]:
#Concat together the same dataframes from different seasons
players = pd.concat([players0, players1, players2, players3, players4])
games = pd.concat([games0, games1, games2, games3, games4])
actions = pd.concat([actions0, actions1, actions2, actions3, actions4]) 
In [7]:
#Adjust the action_id of the action dataframe as it started from 1 with each session
actions.drop(columns=['action_id'], inplace=True)
actions.reset_index(drop=True, inplace=True)
actions.reset_index(inplace=True)
actions.rename(columns={'index': 'action_id'}, inplace=True)
In [8]:
# Convert 'minutes_played' to total minutes with error handling
def convert_to_minutes(time_str):
    try:
        # Convert to string in case it's a float (e.g., NaN)
        time_str = str(time_str)
        # Split the time string into minutes and seconds
        minutes, seconds = map(int, time_str.split(':'))
        # Convert total time to minutes (seconds converted to fraction of minutes)
        return minutes + seconds / 60
    except (ValueError, AttributeError):
        # Handle cases where the conversion fails (e.g., NaN or bad format)
        return 0  # or use `np.nan` if you prefer to mark as missing

# Apply the conversion function to the 'minutes_played' column
players['minutes_played_converted'] = players['minutes_played'].apply(convert_to_minutes)
In [9]:
#Merging together players with identifiers and names for teams
players = players.merge(fb, how="left")
In [10]:
#Creating a table to have minutes played for players in each season and team
mp = players.groupby(["player_name", "player_id", "team_name",
                      "fotmob_id", "season_id"])["minutes_played_converted"].sum().reset_index(name='minutes_played')
In [11]:
#We filter the games dataframe to contain only certain informations
games0 = games[["game_id", "competition_id", "season_id"]]
In [12]:
#Using the spadl/vaep framework to add names on the ids in columns
actions0 = spadl.add_names(actions)
In [13]:
#Merging together actions + games info + players and teams infos + xP values
actions1 = (actions0
           .merge(xP, how="left")
           .merge(fb, how="left")
           .merge(games0, how="left"))
In [14]:
#We also merge the minutes played in the season for the player on the actions
actions = actions1.merge(mp, how="left")
In [15]:
#Creating a column of outcome for successful actions (they are flagged as 1)
actions['outcome'] = np.where((actions["result_name"] == 'success'), True, False)
In [16]:
#We do the necessary calculation to calculate the PPDA each team faced from opposition in each game
#We first store for each game how many passes a team attempted in theirs first 60% of the pitch
passes = actions[actions['type_name'] == 'pass']
passes0 = passes[passes['start_x'] <= 63]
X0 = passes0.groupby(["team_id", "team_name", "game_id", "competition_id", "season_id"])["type_name"].count().reset_index(name='passes')

#We then store how many defensive actions a team attempted in the last 60% of the pitch
defactions0 = actions[actions['type_name'].isin(['tackle', 'foul', 'interception'])]
defactions = defactions0[defactions0['start_x'] >= 42]
X1 = defactions.groupby(["team_id", "team_name", "game_id", "competition_id", "season_id"])["type_name"].count().reset_index(name='defactions')

#This two will be the necessary values to find the PPDA of the opponents as we want to know 
#how many passes per defensive actions the team was allowed to attempt
In [17]:
# Step 1: Create a dictionary to hold game_id to defensive actions of each team
game_teams = X0.groupby('game_id').apply(lambda x: x[['passes', 'team_id', 'team_name',]].to_dict('records')).to_dict()

# Step 2: Define a function to get both opponent defactions and team_id/name
def get_opponent_info(row):
    game_id = row['game_id']
    team_id = row['team_id']
    
    # Get all teams in the current game
    teams_in_game = game_teams[game_id]
    
     # Find the opponent team (the team that isn't the current one)
    opponent = [team for team in teams_in_game if team['team_id'] != team_id]
    
    # If we found an opponent, return both defactions and team_id/team_name
    if opponent:
        opponent = opponent[0]
        return opponent['passes'], opponent['team_id'], opponent['team_name']
    return None, None, None

# Step 3: Apply the function to get opponent defactions
X0['opponent_passes'], X0['opponent_id'], X0['opponent'], = zip(*X0.apply(get_opponent_info, axis=1))
/var/folders/ns/3wxdg4g57h77vxwmr4wzmvt40000gn/T/ipykernel_8484/2710470026.py:2: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  game_teams = X0.groupby('game_id').apply(lambda x: x[['passes', 'team_id', 'team_name',]].to_dict('records')).to_dict()
In [18]:
# Step 1: Create a dictionary to hold game_id to both team_id and defactions
game_teams = X1.groupby('game_id').apply(lambda x: x[['defactions', 'team_id', 'team_name',]].to_dict('records')).to_dict()

# Step 2: Define a function to get both opponent defactions and team_id/name
def get_opponent_info(row):
    game_id = row['game_id']
    team_id = row['team_id']
    
    # Get all teams in the current game
    teams_in_game = game_teams[game_id]
    
    # Find the opponent team (the team that isn't the current one)
    opponent = [team for team in teams_in_game if team['team_id'] != team_id]
    
    # If we found an opponent, return both defactions and team_id/team_name
    if opponent:
        opponent = opponent[0]
        return opponent['defactions'], opponent['team_id'], opponent['team_name']
    return None, None, None

# Step 3: Apply the function and unpack the results into three new columns
X1['opponent_defactions'], X1['opponent_id'], X1['opponent'], = zip(*X1.apply(get_opponent_info, axis=1))
/var/folders/ns/3wxdg4g57h77vxwmr4wzmvt40000gn/T/ipykernel_8484/2738209691.py:2: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  game_teams = X1.groupby('game_id').apply(lambda x: x[['defactions', 'team_id', 'team_name',]].to_dict('records')).to_dict()
In [19]:
# Perform a left merge so we have a table with both passes and defensive actions for team and opponents
X = X0.merge(X1, how="left", on=["team_id", "team_name", "game_id", "competition_id", "season_id", "opponent_id", "opponent"])
In [20]:
X
Out[20]:
team_id team_name game_id competition_id season_id passes opponent_passes opponent_id opponent defactions opponent_defactions
0 1 Brondby IF 1592218 EU-Europa League 2122 208 246.0 354.0 Sparta Prague 32.0 34.0
1 1 Brondby IF 1592220 EU-Europa League 2122 130 509.0 228.0 Lyon 39.0 18.0
2 1 Brondby IF 1592222 EU-Europa League 2122 314 366.0 306.0 Rangers 33.0 32.0
3 1 Brondby IF 1592224 EU-Europa League 2122 179 299.0 306.0 Rangers 26.0 26.0
4 1 Brondby IF 1592226 EU-Europa League 2122 161 416.0 228.0 Lyon 22.0 23.0
... ... ... ... ... ... ... ... ... ... ... ...
59797 30664 St. Louis City 1791167 USA-Major League Soccer 2024 166 289.0 1122.0 San Jose 32.0 27.0
59798 30664 St. Louis City 1791196 USA-Major League Soccer 2024 219 205.0 11134.0 Vancouver 24.0 30.0
59799 30664 St. Louis City 1791214 USA-Major League Soccer 2024 149 326.0 3624.0 Houston 35.0 30.0
59800 30664 St. Louis City 1791224 USA-Major League Soccer 2024 222 232.0 11134.0 Vancouver 28.0 34.0
59801 30664 St. Louis City 1791466 USA-Major League Soccer 2024 358 195.0 9293.0 Minnesota United 26.0 28.0

59802 rows × 11 columns

In [21]:
#We calculate for each game both the team PPDA and opponent's
X['opponent_PPDA'] = X['passes'] / X['opponent_defactions']
X['PPDA'] = X['opponent_passes'] / X['defactions']
In [22]:
#We add dates to the table so we can visualize how intense opponents have been over time (if we want)
dates = games[['game_id', 'game_date']]
Ya = (X.merge(dates, how="left"))
In [23]:
#We filter the table to have the final one
Y = Ya[['team_id', 'team_name', 'game_id', 'game_date', 'season_id', 'opponent_id', 'opponent', 'opponent_PPDA']]
In [24]:
#We find the lowest 30% of intense PPDA so to have a threshold (would be the 30% most intense)
quantiles = Y.groupby('team_id')['opponent_PPDA'].quantile(0.3).reset_index()

# Rename the column to indicate it's the threshold
quantiles = quantiles.rename(columns={'opponent_PPDA': 'opponent_PPDA_thrs'})

# Merge the threshold data back into the original DataFrame
Z = Y.merge(quantiles, on='team_id', how='left')
In [25]:
# Filter for team_name and sort by game_date for the visualization
df_team = Z[Z['team_name'] == 'Juventus'].sort_values(by='game_date')

# Plotting
plt.figure(figsize=(20,10))
plt.plot(df_team['game_date'], df_team['opponent_PPDA'], marker='o', linestyle='-', color='grey')
quantile_value = df_team['opponent_PPDA_thrs'].iloc[0]
plt.axhline(y=quantile_value, color='red', linestyle='--', zorder=2)

# Adding labels and title
plt.xlabel('Game Date')
plt.ylabel('Opponent PPDA')
plt.title(f'Opponent PPDA for {df_team["team_name"].iloc[0]} from {df_team["season_id"].min()} to {df_team["season_id"].max()}')

# Annotate each point with the opponent's name
for i in range(len(df_team)):
    plt.annotate(df_team['opponent'].iloc[i], 
                 (df_team['game_date'].iloc[i], df_team['opponent_PPDA'].iloc[i]),
                 textcoords="offset points", xytext=(0,5), ha='center')

# Invert the y-axis
plt.gca().invert_yaxis()

# Reduce number of x-axis ticks
n = 10  # Show every 3rd date, adjust as needed
plt.xticks(df_team['game_date'][::n], rotation=45)

# Show plot
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [26]:
#We now do the necessary calculations to see how players passing % change against pressing intensive teams
#and use that as a proxy of player ability under pressure
In [27]:
#We calculate number of attempted and successful passes per game for players
passes['PAx'] = passes['outcome'] - passes['xP']
P0 = passes.groupby(["team_id", "team_name", "player_id", "player_name", "game_id",
                     "season_id"])["xP"].sum().reset_index()
P1 = (passes.groupby(["team_id", "team_name", "player_id", "player_name", "game_id", 
                      "season_id"], observed = True)["type_name"].count().reset_index(name='attempted_passes'))

passes1 = passes[passes["result_name"] == 'success']
P2 = (passes1.groupby(["team_id", "team_name", "player_id", "player_name", "game_id",
                       "season_id"], observed = True)["type_name"].count().reset_index(name='successful_passes'))
/var/folders/ns/3wxdg4g57h77vxwmr4wzmvt40000gn/T/ipykernel_8484/4274909673.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  passes['PAx'] = passes['outcome'] - passes['xP']
In [28]:
#we merge to have a single dataframe and then fillna
P = (P0
    .merge(P1, how="left")
    .merge(P2, how="left"))

P[['xP', 'attempted_passes', 'successful_passes']] = P[['xP', 'attempted_passes', 'successful_passes']].fillna(0)
In [29]:
#We filter down the precedently made table and merge it with cluster positions
mpa = mp[['player_name', 'player_id', 'team_name', 'season_id', 'minutes_played']]
mpa = mpa.merge(positions, how='left')
In [30]:
#Then we merge it on the table of players passes attempted and completed
finalP = P.merge(mpa, how="left").merge(Y, how="left")
finalP
Out[30]:
team_id team_name player_id player_name game_id season_id xP attempted_passes successful_passes minutes_played position position_group game_date opponent_id opponent opponent_PPDA
0 1 Brondby IF 108942.0 Josip Radosevic 1592218 2122 41.013142 49 41.0 456.700000 CM CDM 2021-09-16 20:00:00 354.0 Sparta Prague 6.117647
1 1 Brondby IF 108942.0 Josip Radosevic 1592220 2122 28.089613 35 27.0 456.700000 CM CDM 2021-09-30 17:45:00 228.0 Lyon 7.222222
2 1 Brondby IF 108942.0 Josip Radosevic 1592222 2122 26.208827 32 26.0 456.700000 CM CDM 2021-10-21 20:00:00 306.0 Rangers 9.812500
3 1 Brondby IF 108942.0 Josip Radosevic 1592224 2122 43.584918 53 39.0 456.700000 CM CDM 2021-11-04 17:45:00 306.0 Rangers 6.884615
4 1 Brondby IF 108942.0 Josip Radosevic 1592226 2122 28.035128 35 27.0 456.700000 CM CDM 2021-11-25 20:00:00 228.0 Lyon 7.000000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
891915 30664 St. Louis City 533552.0 Jake Girdwood-Reich 1791090 2024 2.507253 4 2.0 195.083333 DM CDM 2024-08-25 03:30:00 11133.0 Portland 6.533333
891916 30664 St. Louis City 533552.0 Jake Girdwood-Reich 1791129 2024 2.120877 3 3.0 195.083333 DM CDM 2024-09-29 01:30:00 1116.0 Kansas City 5.909091
891917 30664 St. Louis City 533552.0 Jake Girdwood-Reich 1791146 2024 33.403613 40 35.0 195.083333 DM CDM 2024-10-03 03:30:00 27482.0 Los Angeles FC 9.147059
891918 30664 St. Louis City 533552.0 Jake Girdwood-Reich 1791167 2024 4.820073 6 6.0 195.083333 DM CDM 2024-09-22 03:30:00 1122.0 San Jose 6.148148
891919 30664 St. Louis City 533552.0 Jake Girdwood-Reich 1791466 2024 0.888112 1 1.0 195.083333 DM CDM 2024-10-20 02:00:00 9293.0 Minnesota United 12.785714

891920 rows × 16 columns

In [31]:
#We find the lowest 30% of intense PPDA of oppontent's for players in season
quantiles = finalP.groupby(['team_id', 'player_id', 'season_id'])['opponent_PPDA'].quantile(0.3).reset_index()

# Rename the column to indicate it's the 20th percentile
quantiles = quantiles.rename(columns={'opponent_PPDA': 'opponent_PPDA_thrs'})
In [32]:
#We merge the threshold on the table
finalP = finalP.merge(quantiles, how='left')
In [33]:
# Step 1: Categorize matches based on PPDA threshold
finalP['ppda_category'] = finalP['opponent_PPDA'].le(finalP['opponent_PPDA_thrs'])  # True if PPDA ≤ threshold, else False

# Step 2: Group and aggregate
aggregated = (
    finalP.groupby(['team_id', 'team_name', 'player_id', 'player_name', 'season_id', 'ppda_category'])
      .agg(
          xP_sum=('xP', 'sum'),
          attempted_passes_sum=('attempted_passes', 'sum'),
          successful_passes_sum=('successful_passes', 'sum')
      )
      .reset_index()
)

# Step 3: Separate results into "low-PPDA" and "high-PPDA"
low_ppda = aggregated[aggregated['ppda_category'] == True]
high_ppda = aggregated[aggregated['ppda_category'] == False]
In [34]:
#We calculate expected and actual percentage both for games below and over the threshold
high_ppda['xP%'] = (high_ppda['xP_sum'] / high_ppda['attempted_passes_sum']) * 100
high_ppda['Passes %'] = (high_ppda['successful_passes_sum'] /high_ppda['attempted_passes_sum']) * 100

low_ppda['xP%'] = (low_ppda['xP_sum'] / low_ppda['attempted_passes_sum']) * 100
low_ppda['Passes %'] = (low_ppda['successful_passes_sum'] /low_ppda['attempted_passes_sum']) * 100
/var/folders/ns/3wxdg4g57h77vxwmr4wzmvt40000gn/T/ipykernel_8484/3043594213.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  high_ppda['xP%'] = (high_ppda['xP_sum'] / high_ppda['attempted_passes_sum']) * 100
/var/folders/ns/3wxdg4g57h77vxwmr4wzmvt40000gn/T/ipykernel_8484/3043594213.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  high_ppda['Passes %'] = (high_ppda['successful_passes_sum'] /high_ppda['attempted_passes_sum']) * 100
/var/folders/ns/3wxdg4g57h77vxwmr4wzmvt40000gn/T/ipykernel_8484/3043594213.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  low_ppda['xP%'] = (low_ppda['xP_sum'] / low_ppda['attempted_passes_sum']) * 100
/var/folders/ns/3wxdg4g57h77vxwmr4wzmvt40000gn/T/ipykernel_8484/3043594213.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  low_ppda['Passes %'] = (low_ppda['successful_passes_sum'] /low_ppda['attempted_passes_sum']) * 100
In [35]:
#We filter both dataframe for below and over the threshold
high_ppda = high_ppda[['team_id', 'team_name', 'player_id', 'player_name', 'season_id', 'Passes %']]
low_ppda = low_ppda[['team_id', 'team_name', 'player_id', 'player_name', 'season_id', 'Passes %']]
In [36]:
# Renaming the `Passes %` column to distinguish between the two dataframes
df1 = high_ppda.rename(columns={'Passes %': 'Passes %'})
df2 = low_ppda.rename(columns={'Passes %': 'Pressured_Passes %'})

# We then merge the two
merged_df = df1.merge(df2)

# Finally we calculate the difference the player have under pressure
merged_df['press_change_pass_pct'] = merged_df['Pressured_Passes %'] - merged_df['Passes %']
In [37]:
#To explore the result we merge the players info dataframe and look at what positions we have
merged_df = merged_df.merge(mpa, how='left')
merged_df.position_group.unique()
Out[37]:
array(['CDM', 'CB', 'Undefined', 'AMW', 'GK', 'WB', 'ST', nan],
      dtype=object)
In [38]:
#We then filter for player with at least 1000 minutes played in 2425, go for the best under pressure to list and select position
X = merged_df.sort_values(by=['press_change_pass_pct'], ascending=False)
X = X[(X['minutes_played'] > 1000) & (X['season_id'] == 2425)]
X[X['position_group'].isin(['CB'])].head(20)
Out[38]:
team_id team_name player_id player_name season_id Passes % Pressured_Passes % press_change_pass_pct minutes_played position position_group
27736 838 Las Palmas 497198.0 Juan Herzog 2425 82.558140 92.307692 9.749553 1122.483333 RCB CB
38175 2157 Accrington 432338.0 Zach Awe 2425 62.696629 71.084337 8.387708 2281.816667 LCB CB
16149 203 Lincoln City 144375.0 Adam Jackson 2425 67.073171 73.157895 6.084724 2423.500000 RCB CB
8125 98 Exeter 134174.0 Ben Purrington 2425 67.936508 73.303167 5.366659 1215.633333 LCB CB
1303 22 Bradford 469637.0 Ciarán Kelly 2425 65.237020 70.588235 5.351215 1514.066667 LCB CB
19758 258 Cheltenham 382792.0 Ibrahim Bakare 2425 55.379747 60.447761 5.068014 1856.900000 RCB CB
1585 25 Sheff Wed 83778.0 Liam Palmer 2425 84.352078 89.285714 4.933636 1259.033333 RCB CB
9872 128 FC Utrecht 101605.0 Mike van der Hoorn 2425 80.915287 85.675676 4.760388 2488.600000 RCB CB
31815 990 Stevenage 134494.0 Nathan Thompson 2425 80.625000 85.234899 4.609899 1366.733333 RCB CB
38554 2188 Estoril 436897.0 Pedro Álvaro 2425 84.417178 88.857939 4.440761 2624.666667 RCB CB
12370 163 Sheff Utd 312670.0 Hamza Choudhury 2425 81.081081 85.401460 4.320379 1111.950000 RCB CB
3277 41 Stuttgart 416149.0 Anthony Rouault 2425 89.125296 92.822967 3.697671 1970.500000 RCB CB
15523 193 Cambridge U 374630.0 Kelland Watts 2425 59.449541 62.978723 3.529182 2294.433333 LCB CB
24006 318 Hearts 421328.0 Craig Halkett 2425 88.059701 91.304348 3.244646 1010.900000 RCB CB
5678 71 Bologna 349406.0 Nicolò Casale 2425 87.309645 90.551181 3.241536 1019.366667 LCB CB
9692 122 Boavista 444822.0 Pedro Gomes 2425 75.636364 78.761062 3.124698 1552.850000 RCB CB
46063 28583 Dynamo Makhachkala 531314.0 Idar Shumakhov 2425 74.774775 77.431907 2.657132 2423.183333 RCB CB
1553 24 Aston Villa 408946.0 Lamare Bogarde 2425 90.267640 92.857143 2.589503 1035.550000 RCB CB
23062 305 St. Johnstone 542627.0 Bozo Mikulic 2425 83.301344 85.792350 2.491006 1262.800000 LCB CB
13353 171 QPR 121613.0 Morgan Fox 2425 79.120879 81.560284 2.439405 1501.500000 LCB CB
In [39]:
#We then select the columns we want to keep from the non filtered dataframe and save it for later use
to_use = merged_df[['team_name', 'player_id', 'player_name', 'season_id', 'press_change_pass_pct']]
to_use.to_csv("underpressureproxy.csv")
In [ ]: