In [1]:
#Import necessary packages
import os
import warnings
from tqdm import tqdm
import socceraction.spadl as spadl
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
In [2]:
#Import all the necessary dataframes
xP = pd.read_csv("xPactions.csv", index_col = 0)
fb = pd.read_csv("teamsFOTMOB.csv", index_col = 0)
positions = pd.read_csv("clustered_position.csv", index_col = 0)
In [3]:
players0 = pd.read_csv("players2021.csv", index_col = 0)
players1 = pd.read_csv("players2122.csv", index_col = 0)
players2 = pd.read_csv("players2223.csv", index_col = 0)
players3 = pd.read_csv("players2324.csv", index_col = 0)
players4 = pd.read_csv("players2425.csv", index_col = 0)
In [4]:
games0 = pd.read_csv("games2021.csv", index_col = 0)
games1 = pd.read_csv("games2122.csv", index_col = 0)
games2 = pd.read_csv("games2223.csv", index_col = 0)
games3 = pd.read_csv("games2324.csv", index_col = 0)
games4 = pd.read_csv("games2425.csv", index_col = 0)
In [5]:
actions0 = pd.read_csv("actions2021.csv", index_col = 0)
actions1 = pd.read_csv("actions2122.csv", index_col = 0)
actions2 = pd.read_csv("actions2223.csv", index_col = 0)
actions3 = pd.read_csv("actions2324.csv", index_col = 0)
actions4 = pd.read_csv("actions2425.csv", index_col = 0)
In [6]:
#Concatenate the corresponding dataframes from the different seasons
players = pd.concat([players0, players1, players2, players3, players4])
games = pd.concat([games0, games1, games2, games3, games4])
actions = pd.concat([actions0, actions1, actions2, actions3, actions4])
In [7]:
#Rebuild the action_id of the actions dataframe, as the original ids restart from 1 each season
actions.drop(columns=['action_id'], inplace=True)
actions.reset_index(drop=True, inplace=True)
actions.reset_index(inplace=True)
actions.rename(columns={'index': 'action_id'}, inplace=True)
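#A more compact equivalent (a sketch, kept commented out since the steps above already did the job):
#actions = actions.drop(columns=['action_id']).reset_index(drop=True)
#actions.insert(0, 'action_id', actions.index)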
In [8]:
# Convert 'minutes_played' to total minutes with error handling
def convert_to_minutes(time_str):
    try:
        # Convert to string in case it's a float (e.g., NaN)
        time_str = str(time_str)
        # Split the time string into minutes and seconds
        minutes, seconds = map(int, time_str.split(':'))
        # Convert total time to minutes (seconds become a fraction of a minute)
        return minutes + seconds / 60
    except (ValueError, AttributeError):
        # Handle cases where the conversion fails (e.g., NaN or bad format)
        return 0  # or use `np.nan` to mark as missing
# Apply the conversion function to the 'minutes_played' column
players['minutes_played_converted'] = players['minutes_played'].apply(convert_to_minutes)
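#A vectorized alternative sketch (assumes the non-missing values are "MM:SS" strings;
#anything malformed becomes 0, matching convert_to_minutes above):
mm_ss = players['minutes_played'].astype(str).str.split(':', expand=True)
minutes_vec = (pd.to_numeric(mm_ss[0], errors='coerce')
               + pd.to_numeric(mm_ss[1], errors='coerce') / 60).fillna(0)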
In [9]:
#Merge team identifiers and names onto the players dataframe
players = players.merge(fb, how="left")
In [10]:
#Create a table of minutes played per player, team and season
mp = players.groupby(["player_name", "player_id", "team_name",
                      "fotmob_id", "season_id"])["minutes_played_converted"].sum().reset_index(name='minutes_played')
In [11]:
#We filter the games dataframe down to the columns we need
games0 = games[["game_id", "competition_id", "season_id"]]
In [12]:
#Use the SPADL/VAEP framework to add human-readable names for the id columns
actions0 = spadl.add_names(actions)
In [13]:
#Merge together actions + game info + player and team info + xP values
actions1 = (actions0
            .merge(xP, how="left")
            .merge(fb, how="left")
            .merge(games0, how="left"))
In [14]:
#We also merge each player's minutes played in the season onto the actions
actions = actions1.merge(mp, how="left")
In [15]:
#Creating a column of outcome for successful actions (they are flagged as 1)
actions['outcome'] = actions['result_name'] == 'success'
In [16]:
#We now do the calculations needed to compute the PPDA each team faced from its opponents in each game
#We first store, for each game, how many passes a team attempted in its own first 60% of the pitch
#(start_x <= 63 on SPADL's 105-unit-long pitch); .copy() avoids SettingWithCopyWarning later on
passes = actions[actions['type_name'] == 'pass'].copy()
passes0 = passes[passes['start_x'] <= 63]
X0 = passes0.groupby(["team_id", "team_name", "game_id", "competition_id", "season_id"])["type_name"].count().reset_index(name='passes')
#We then store how many defensive actions a team attempted in the last 60% of the pitch (start_x >= 42)
defactions0 = actions[actions['type_name'].isin(['tackle', 'foul', 'interception'])]
defactions = defactions0[defactions0['start_x'] >= 42]
X1 = defactions.groupby(["team_id", "team_name", "game_id", "competition_id", "season_id"])["type_name"].count().reset_index(name='defactions')
#These two counts are what we need for the opponents' PPDA, as we want to know
#how many passes per defensive action the team was allowed to attempt
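#In formula form, for a given team and game:
#    opponent_PPDA = (passes we attempted in our own 60%) / (opponent defensive actions in the final 60%)
#Toy example with hypothetical numbers: if a team attempts 300 such passes while the
#opponent makes 30 tackles/fouls/interceptions high up, the opponent's PPDA is 300 / 30 = 10;
#lower values mean more intense pressing.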
In [17]:
# Step 1: Create a dictionary mapping game_id to the passes of each team in that game
# (selecting the columns before apply avoids pandas' DeprecationWarning about grouping columns)
game_teams = X0.groupby('game_id')[['passes', 'team_id', 'team_name']].apply(lambda x: x.to_dict('records')).to_dict()
# Step 2: Define a function to get the opponent's passes and team_id/name
def get_opponent_info(row):
    game_id = row['game_id']
    team_id = row['team_id']
    # Get all teams in the current game
    teams_in_game = game_teams[game_id]
    # Find the opponent team (the team that isn't the current one)
    opponent = [team for team in teams_in_game if team['team_id'] != team_id]
    # If we found an opponent, return its passes and its team_id/team_name
    if opponent:
        opponent = opponent[0]
        return opponent['passes'], opponent['team_id'], opponent['team_name']
    return None, None, None
# Step 3: Apply the function and unpack the results into three new columns
X0['opponent_passes'], X0['opponent_id'], X0['opponent'] = zip(*X0.apply(get_opponent_info, axis=1))
In [18]:
# Step 1: Create a dictionary mapping game_id to the defensive actions of each team in that game
game_teams = X1.groupby('game_id')[['defactions', 'team_id', 'team_name']].apply(lambda x: x.to_dict('records')).to_dict()
# Step 2: Define a function to get both opponent defactions and team_id/name
def get_opponent_info(row):
    game_id = row['game_id']
    team_id = row['team_id']
    # Get all teams in the current game
    teams_in_game = game_teams[game_id]
    # Find the opponent team (the team that isn't the current one)
    opponent = [team for team in teams_in_game if team['team_id'] != team_id]
    # If we found an opponent, return both defactions and team_id/team_name
    if opponent:
        opponent = opponent[0]
        return opponent['defactions'], opponent['team_id'], opponent['team_name']
    return None, None, None
# Step 3: Apply the function and unpack the results into three new columns
X1['opponent_defactions'], X1['opponent_id'], X1['opponent'] = zip(*X1.apply(get_opponent_info, axis=1))
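#The same opponent lookup could also be expressed as a self-merge (a sketch; `alt` is an
#illustrative name, and it assumes exactly two teams per game_id), avoiding the row-wise apply:
alt = X1.merge(X1[['game_id', 'team_id', 'team_name', 'defactions']], on='game_id', suffixes=('', '_opp'))
alt = alt[alt['team_id'] != alt['team_id_opp']]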
In [19]:
# Perform a left merge so we have a table with both passes and defensive actions for team and opponents
X = X0.merge(X1, how="left", on=["team_id", "team_name", "game_id", "competition_id", "season_id", "opponent_id", "opponent"])
In [20]:
X
Out[20]:
| | team_id | team_name | game_id | competition_id | season_id | passes | opponent_passes | opponent_id | opponent | defactions | opponent_defactions |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Brondby IF | 1592218 | EU-Europa League | 2122 | 208 | 246.0 | 354.0 | Sparta Prague | 32.0 | 34.0 |
| 1 | 1 | Brondby IF | 1592220 | EU-Europa League | 2122 | 130 | 509.0 | 228.0 | Lyon | 39.0 | 18.0 |
| 2 | 1 | Brondby IF | 1592222 | EU-Europa League | 2122 | 314 | 366.0 | 306.0 | Rangers | 33.0 | 32.0 |
| 3 | 1 | Brondby IF | 1592224 | EU-Europa League | 2122 | 179 | 299.0 | 306.0 | Rangers | 26.0 | 26.0 |
| 4 | 1 | Brondby IF | 1592226 | EU-Europa League | 2122 | 161 | 416.0 | 228.0 | Lyon | 22.0 | 23.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 59797 | 30664 | St. Louis City | 1791167 | USA-Major League Soccer | 2024 | 166 | 289.0 | 1122.0 | San Jose | 32.0 | 27.0 |
| 59798 | 30664 | St. Louis City | 1791196 | USA-Major League Soccer | 2024 | 219 | 205.0 | 11134.0 | Vancouver | 24.0 | 30.0 |
| 59799 | 30664 | St. Louis City | 1791214 | USA-Major League Soccer | 2024 | 149 | 326.0 | 3624.0 | Houston | 35.0 | 30.0 |
| 59800 | 30664 | St. Louis City | 1791224 | USA-Major League Soccer | 2024 | 222 | 232.0 | 11134.0 | Vancouver | 28.0 | 34.0 |
| 59801 | 30664 | St. Louis City | 1791466 | USA-Major League Soccer | 2024 | 358 | 195.0 | 9293.0 | Minnesota United | 26.0 | 28.0 |

59802 rows × 11 columns
In [21]:
#We calculate, for each game, both the team's PPDA and the opponent's
X['opponent_PPDA'] = X['passes'] / X['opponent_defactions']
X['PPDA'] = X['opponent_passes'] / X['defactions']
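#A small guard (a sketch): a game with zero recorded defensive actions would give an
#infinite PPDA, so we turn such values into NaN to keep the quantiles below well behaved.
X[['PPDA', 'opponent_PPDA']] = X[['PPDA', 'opponent_PPDA']].replace([np.inf, -np.inf], np.nan)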
In [22]:
#We add dates to the table so we can visualize how intense opponents have been over time (if we want)
dates = games[['game_id', 'game_date']]
Ya = (X.merge(dates, how="left"))
In [23]:
#We keep only the columns needed for the final table
Y = Ya[['team_id', 'team_name', 'game_id', 'game_date', 'season_id', 'opponent_id', 'opponent', 'opponent_PPDA']]
In [24]:
#We take each team's 30th percentile of opponent PPDA as a threshold (games below it are
#roughly the 30% facing the most intense pressing, since lower PPDA means more pressing)
quantiles = Y.groupby('team_id')['opponent_PPDA'].quantile(0.3).reset_index()
# Rename the column to indicate it's the threshold
quantiles = quantiles.rename(columns={'opponent_PPDA': 'opponent_PPDA_thrs'})
# Merge the threshold data back into the original DataFrame
Z = Y.merge(quantiles, on='team_id', how='left')
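#An equivalent one-step alternative (a sketch, kept commented out since it reproduces the
#merge above): groupby().transform attaches the per-team 30th percentile directly.
#Z['opponent_PPDA_thrs'] = Z.groupby('team_id')['opponent_PPDA'].transform(lambda s: s.quantile(0.3))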
In [25]:
# Filter for team_name and sort by game_date for the visualization
df_team = Z[Z['team_name'] == 'Juventus'].sort_values(by='game_date')
# Plotting
plt.figure(figsize=(20,10))
plt.plot(df_team['game_date'], df_team['opponent_PPDA'], marker='o', linestyle='-', color='grey')
quantile_value = df_team['opponent_PPDA_thrs'].iloc[0]
plt.axhline(y=quantile_value, color='red', linestyle='--', zorder=2)
# Adding labels and title
plt.xlabel('Game Date')
plt.ylabel('Opponent PPDA')
plt.title(f'Opponent PPDA for {df_team["team_name"].iloc[0]} from {df_team["season_id"].min()} to {df_team["season_id"].max()}')
# Annotate each point with the opponent's name
for i in range(len(df_team)):
    plt.annotate(df_team['opponent'].iloc[i],
                 (df_team['game_date'].iloc[i], df_team['opponent_PPDA'].iloc[i]),
                 textcoords="offset points", xytext=(0, 5), ha='center')
# Invert the y-axis so that more intense pressing (lower PPDA) appears higher
plt.gca().invert_yaxis()
# Reduce the number of x-axis ticks
n = 10  # Show every 10th date, adjust as needed
plt.xticks(df_team['game_date'][::n], rotation=90)
# Show plot
plt.tight_layout()
plt.show()
In [26]:
#We now do the calculations needed to see how players' passing % changes against press-intensive teams,
#and use that as a proxy for a player's ability under pressure
In [27]:
#We calculate the expected (xP), attempted and successful passes per player per game
#(PAx = passes completed above expectation)
passes['PAx'] = passes['outcome'] - passes['xP']
P0 = passes.groupby(["team_id", "team_name", "player_id", "player_name", "game_id",
                     "season_id"])["xP"].sum().reset_index()
P1 = (passes.groupby(["team_id", "team_name", "player_id", "player_name", "game_id",
                      "season_id"], observed=True)["type_name"].count().reset_index(name='attempted_passes'))
passes1 = passes[passes["result_name"] == 'success']
P2 = (passes1.groupby(["team_id", "team_name", "player_id", "player_name", "game_id",
                       "season_id"], observed=True)["type_name"].count().reset_index(name='successful_passes'))
In [28]:
#We merge the three into a single dataframe and then fill missing values with 0
P = (P0
     .merge(P1, how="left")
     .merge(P2, how="left"))
P[['xP', 'attempted_passes', 'successful_passes']] = P[['xP', 'attempted_passes', 'successful_passes']].fillna(0)
In [29]:
#We filter down the previously built minutes table and merge it with the clustered positions
mpa = mp[['player_name', 'player_id', 'team_name', 'season_id', 'minutes_played']]
mpa = mpa.merge(positions, how='left')
In [30]:
#Then we merge it onto the table of player passes attempted and completed
finalP = P.merge(mpa, how="left").merge(Y, how="left")
finalP
Out[30]:
| | team_id | team_name | player_id | player_name | game_id | season_id | xP | attempted_passes | successful_passes | minutes_played | position | position_group | game_date | opponent_id | opponent | opponent_PPDA |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Brondby IF | 108942.0 | Josip Radosevic | 1592218 | 2122 | 41.013142 | 49 | 41.0 | 456.700000 | CM | CDM | 2021-09-16 20:00:00 | 354.0 | Sparta Prague | 6.117647 |
| 1 | 1 | Brondby IF | 108942.0 | Josip Radosevic | 1592220 | 2122 | 28.089613 | 35 | 27.0 | 456.700000 | CM | CDM | 2021-09-30 17:45:00 | 228.0 | Lyon | 7.222222 |
| 2 | 1 | Brondby IF | 108942.0 | Josip Radosevic | 1592222 | 2122 | 26.208827 | 32 | 26.0 | 456.700000 | CM | CDM | 2021-10-21 20:00:00 | 306.0 | Rangers | 9.812500 |
| 3 | 1 | Brondby IF | 108942.0 | Josip Radosevic | 1592224 | 2122 | 43.584918 | 53 | 39.0 | 456.700000 | CM | CDM | 2021-11-04 17:45:00 | 306.0 | Rangers | 6.884615 |
| 4 | 1 | Brondby IF | 108942.0 | Josip Radosevic | 1592226 | 2122 | 28.035128 | 35 | 27.0 | 456.700000 | CM | CDM | 2021-11-25 20:00:00 | 228.0 | Lyon | 7.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 891915 | 30664 | St. Louis City | 533552.0 | Jake Girdwood-Reich | 1791090 | 2024 | 2.507253 | 4 | 2.0 | 195.083333 | DM | CDM | 2024-08-25 03:30:00 | 11133.0 | Portland | 6.533333 |
| 891916 | 30664 | St. Louis City | 533552.0 | Jake Girdwood-Reich | 1791129 | 2024 | 2.120877 | 3 | 3.0 | 195.083333 | DM | CDM | 2024-09-29 01:30:00 | 1116.0 | Kansas City | 5.909091 |
| 891917 | 30664 | St. Louis City | 533552.0 | Jake Girdwood-Reich | 1791146 | 2024 | 33.403613 | 40 | 35.0 | 195.083333 | DM | CDM | 2024-10-03 03:30:00 | 27482.0 | Los Angeles FC | 9.147059 |
| 891918 | 30664 | St. Louis City | 533552.0 | Jake Girdwood-Reich | 1791167 | 2024 | 4.820073 | 6 | 6.0 | 195.083333 | DM | CDM | 2024-09-22 03:30:00 | 1122.0 | San Jose | 6.148148 |
| 891919 | 30664 | St. Louis City | 533552.0 | Jake Girdwood-Reich | 1791466 | 2024 | 0.888112 | 1 | 1.0 | 195.083333 | DM | CDM | 2024-10-20 02:00:00 | 9293.0 | Minnesota United | 12.785714 |

891920 rows × 16 columns
In [31]:
#We find each player's 30th percentile of opponent PPDA within a season and team
quantiles = finalP.groupby(['team_id', 'player_id', 'season_id'])['opponent_PPDA'].quantile(0.3).reset_index()
# Rename the column to indicate it's the threshold
quantiles = quantiles.rename(columns={'opponent_PPDA': 'opponent_PPDA_thrs'})
In [32]:
#We merge the threshold on the table
finalP = finalP.merge(quantiles, how='left')
In [33]:
# Step 1: Categorize matches based on the PPDA threshold
finalP['ppda_category'] = finalP['opponent_PPDA'].le(finalP['opponent_PPDA_thrs'])  # True if PPDA ≤ threshold, else False
# Step 2: Group and aggregate
aggregated = (
    finalP.groupby(['team_id', 'team_name', 'player_id', 'player_name', 'season_id', 'ppda_category'])
    .agg(
        xP_sum=('xP', 'sum'),
        attempted_passes_sum=('attempted_passes', 'sum'),
        successful_passes_sum=('successful_passes', 'sum')
    )
    .reset_index()
)
# Step 3: Separate the results into "low-PPDA" (intense pressing) and "high-PPDA" (less pressing) games
# (.copy() avoids SettingWithCopyWarning when we add columns below)
low_ppda = aggregated[aggregated['ppda_category'] == True].copy()
high_ppda = aggregated[aggregated['ppda_category'] == False].copy()
In [34]:
#We calculate the expected and actual pass completion % for games both below and above the threshold
high_ppda['xP%'] = (high_ppda['xP_sum'] / high_ppda['attempted_passes_sum']) * 100
high_ppda['Passes %'] = (high_ppda['successful_passes_sum'] / high_ppda['attempted_passes_sum']) * 100
low_ppda['xP%'] = (low_ppda['xP_sum'] / low_ppda['attempted_passes_sum']) * 100
low_ppda['Passes %'] = (low_ppda['successful_passes_sum'] / low_ppda['attempted_passes_sum']) * 100
In [35]:
#We keep only the needed columns in both dataframes (below and above the threshold)
high_ppda = high_ppda[['team_id', 'team_name', 'player_id', 'player_name', 'season_id', 'Passes %']]
low_ppda = low_ppda[['team_id', 'team_name', 'player_id', 'player_name', 'season_id', 'Passes %']]
In [36]:
# Rename the low-PPDA `Passes %` column so the two dataframes can be told apart after merging
df1 = high_ppda
df2 = low_ppda.rename(columns={'Passes %': 'Pressured_Passes %'})
# We then merge the two
merged_df = df1.merge(df2)
# Finally we calculate how much each player's passing % changes under pressure
merged_df['press_change_pass_pct'] = merged_df['Pressured_Passes %'] - merged_df['Passes %']
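#Reading the metric (hypothetical numbers): a player completing 80% of passes in normal
#games but 85% in the pressured (low-PPDA) games gets press_change_pass_pct = 85 - 80 = +5,
#i.e. a five percentage point improvement under pressure.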
In [37]:
#To explore the results we merge in the player info dataframe and look at the positions we have
merged_df = merged_df.merge(mpa, how='left')
merged_df.position_group.unique()
Out[37]:
array(['CDM', 'CB', 'Undefined', 'AMW', 'GK', 'WB', 'ST', nan], dtype=object)
In [38]:
#We then filter for players with at least 1000 minutes played in 2425, sort by who improves most under pressure, and select a position group
X = merged_df.sort_values(by=['press_change_pass_pct'], ascending=False)
X = X[(X['minutes_played'] > 1000) & (X['season_id'] == 2425)]
X[X['position_group'].isin(['CB'])].head(20)
Out[38]:
| | team_id | team_name | player_id | player_name | season_id | Passes % | Pressured_Passes % | press_change_pass_pct | minutes_played | position | position_group |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 27736 | 838 | Las Palmas | 497198.0 | Juan Herzog | 2425 | 82.558140 | 92.307692 | 9.749553 | 1122.483333 | RCB | CB |
| 38175 | 2157 | Accrington | 432338.0 | Zach Awe | 2425 | 62.696629 | 71.084337 | 8.387708 | 2281.816667 | LCB | CB |
| 16149 | 203 | Lincoln City | 144375.0 | Adam Jackson | 2425 | 67.073171 | 73.157895 | 6.084724 | 2423.500000 | RCB | CB |
| 8125 | 98 | Exeter | 134174.0 | Ben Purrington | 2425 | 67.936508 | 73.303167 | 5.366659 | 1215.633333 | LCB | CB |
| 1303 | 22 | Bradford | 469637.0 | Ciarán Kelly | 2425 | 65.237020 | 70.588235 | 5.351215 | 1514.066667 | LCB | CB |
| 19758 | 258 | Cheltenham | 382792.0 | Ibrahim Bakare | 2425 | 55.379747 | 60.447761 | 5.068014 | 1856.900000 | RCB | CB |
| 1585 | 25 | Sheff Wed | 83778.0 | Liam Palmer | 2425 | 84.352078 | 89.285714 | 4.933636 | 1259.033333 | RCB | CB |
| 9872 | 128 | FC Utrecht | 101605.0 | Mike van der Hoorn | 2425 | 80.915287 | 85.675676 | 4.760388 | 2488.600000 | RCB | CB |
| 31815 | 990 | Stevenage | 134494.0 | Nathan Thompson | 2425 | 80.625000 | 85.234899 | 4.609899 | 1366.733333 | RCB | CB |
| 38554 | 2188 | Estoril | 436897.0 | Pedro Álvaro | 2425 | 84.417178 | 88.857939 | 4.440761 | 2624.666667 | RCB | CB |
| 12370 | 163 | Sheff Utd | 312670.0 | Hamza Choudhury | 2425 | 81.081081 | 85.401460 | 4.320379 | 1111.950000 | RCB | CB |
| 3277 | 41 | Stuttgart | 416149.0 | Anthony Rouault | 2425 | 89.125296 | 92.822967 | 3.697671 | 1970.500000 | RCB | CB |
| 15523 | 193 | Cambridge U | 374630.0 | Kelland Watts | 2425 | 59.449541 | 62.978723 | 3.529182 | 2294.433333 | LCB | CB |
| 24006 | 318 | Hearts | 421328.0 | Craig Halkett | 2425 | 88.059701 | 91.304348 | 3.244646 | 1010.900000 | RCB | CB |
| 5678 | 71 | Bologna | 349406.0 | Nicolò Casale | 2425 | 87.309645 | 90.551181 | 3.241536 | 1019.366667 | LCB | CB |
| 9692 | 122 | Boavista | 444822.0 | Pedro Gomes | 2425 | 75.636364 | 78.761062 | 3.124698 | 1552.850000 | RCB | CB |
| 46063 | 28583 | Dynamo Makhachkala | 531314.0 | Idar Shumakhov | 2425 | 74.774775 | 77.431907 | 2.657132 | 2423.183333 | RCB | CB |
| 1553 | 24 | Aston Villa | 408946.0 | Lamare Bogarde | 2425 | 90.267640 | 92.857143 | 2.589503 | 1035.550000 | RCB | CB |
| 23062 | 305 | St. Johnstone | 542627.0 | Bozo Mikulic | 2425 | 83.301344 | 85.792350 | 2.491006 | 1262.800000 | LCB | CB |
| 13353 | 171 | QPR | 121613.0 | Morgan Fox | 2425 | 79.120879 | 81.560284 | 2.439405 | 1501.500000 | LCB | CB |
In [39]:
#We then select the columns we want to keep from the unfiltered dataframe and save them for later use
to_use = merged_df[['team_name', 'player_id', 'player_name', 'season_id', 'press_change_pass_pct']]
to_use.to_csv("underpressureproxy.csv")
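#The saved file can be read back later with, e.g. (illustrative name):
#pressure_proxy = pd.read_csv("underpressureproxy.csv", index_col=0)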