In [1]:
# Import necessary libraries for data manipulation, visualization, and analysis
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import urllib
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
from tqdm import tqdm
from scipy.spatial import ConvexHull, QhullError
from matplotlib.path import Path
import socceraction
import socceraction.atomic.spadl as atomicspadl
import socceraction.spadl as spadl
import gc
In [2]:
# Load custom fonts for visualization
fe_regular = fm.FontEntry(
fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-Regular.ttf',
name='SourceSansPro-Regular'
)
fe_semibold = fm.FontEntry(
fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-SemiBold.ttf',
name='SourceSansPro-SemiBold'
)
# Insert both fonts into the font manager
fm.fontManager.ttflist.insert(0, fe_regular)
fm.fontManager.ttflist.insert(1, fe_semibold)
# Set the font family to the custom regular font
matplotlib.rcParams['font.family'] = fe_regular.name
In [3]:
# Load datasets from CSV files
VAEP = pd.read_csv("aVAEPactions.csv", index_col = 0) # ATOMIC VAEP values for actions
fb = pd.read_csv("teamsFOTMOB.csv", index_col = 0) # Ids mapping from FOTMOB and Whoscored
position = pd.read_csv("clustered_position.csv", index_col = 0) #File with positions cluster for players
players = pd.read_csv("players2425.csv", index_col = 0) #Players infos from games
teams = pd.read_csv("teams2425.csv", index_col = 0) #Teams in 2425 season
aactions = pd.read_csv("atomic_actions2425.csv", index_col = 0) #Event data
games0 = pd.read_csv("games2425.csv", index_col = 0) #Games infos
games = games0[["game_id", "competition_id", "season_id", "game_date"]] #Games columns I want
In [4]:
#Loading and cleaning the dataset of events which are recoveries but are not identified as such in atomic event data framework
recoveries = pd.read_csv("recoveries_id2425.csv", index_col = 0)
recoveries['original_event_id'] = recoveries['event_id'].astype(int)
recoveries = recoveries.rename(columns={'type_name' : 'is_recovery'})
recoveries = recoveries[['game_id', 'is_recovery', 'original_event_id']]
In [5]:
#Already created event dataframe with actions correspongind to player in convex hull to concatenate with new events later on
convexXX = pd.read_csv("actions_against_convex2425.csv", index_col = 0)
In [6]:
#Adding info to atomic event data
aactions = atomicspadl.add_names(aactions)
In [7]:
#Conversion of time played format into seconds
def convert_to_seconds(time_str):
try:
# Convert to string in case it's a float (e.g., NaN)
time_str = str(time_str)
# Split the time string into minutes and seconds
minutes, seconds = map(int, time_str.split(':'))
# Convert total time to seconds (minutes converted to seconds)
return minutes * 60 + seconds
except (ValueError, AttributeError):
# Handle cases where the conversion fails (e.g., NaN or bad format)
return 0 # or use `np.nan` if you prefer to mark as missing
# Apply the conversion function to the 'minutes_played' column
players['seconds_played'] = players['minutes_played'].apply(convert_to_seconds)
players['start_second'] = players['start_second'].apply(convert_to_seconds)
players['end_second'] = players['end_second'].apply(convert_to_seconds)
In [8]:
#Creating the event dataframe to work on
dfa = (
aactions
.merge(games, how="left")
.merge(recoveries, how="left")
.merge(fb, how="left")
.merge(VAEP, how="left")
)
In [9]:
#Creating the player dataframe to work on
playersX = (
players
.merge(fb, how="left")
.merge(position, how="left"))
In [10]:
#List of competition in dataset
dfa.competition_id.unique()
Out[10]:
array(['BEL-Jupiler Pro League', 'BRA-Brasileirão', 'ENG-Championship',
'ENG-FA Cup', 'ENG-League Cup', 'ENG-League One', 'ENG-League Two',
'ENG-Premier League', 'ESP-La Liga', 'EU-Champions League',
'EU-Europa League', 'FRA-Ligue 1', 'GER-Bundesliga', 'ITA-Serie A',
'NED-Eredivisie', 'POR-Liga Portugal', 'RUS-Premier League',
'SCO-Premiership', 'USA-Major League Soccer'], dtype=object)
In [11]:
#Defining the competition I'd go look for
competition = ['ITA-Serie A']
In [12]:
#Filtering the actions for specific type so to keep only those in the list and dribbles that are not recoveries
actionsX = dfa[dfa["type_name"].isin(['pass', 'receival', 'dribble', 'out', 'take_on', 'shot', 'offside', 'bad_touch', 'cross', 'goal'])]
A0 = actionsX[actionsX['is_recovery'] != 'ball recovery']
#Filtering the action on which to draw the convex hulls
CNVX0 = dfa[dfa["type_name"].isin(['pass', 'receival', 'dribble', 'out', 'take_on', 'shot', 'offside', 'bad_touch', 'cross', 'goal',
'dribble', 'interception', 'clearance', 'tackle', 'foul'])]
In [13]:
#Filtering which players we want to draw convex hulls for
playersX = players[players["seconds_played"] >= 1]
P0 = playersX[playersX['starting_position'] != 'GK']
#IF IS NECESSARY TO FILTER FOR COMPETITION AND/OR SEASON
In [14]:
#Filtering by competition
A0 = actionsX[actionsX["competition_id"].isin(competition)]
P0 = playersX[playersX["competition_id"].isin(competition)]
CNVX0 = CNVX0[CNVX0["competition_id"].isin(competition)]
In [15]:
#Further adjustements on the convex hulls events – we don't want to keep what happens in the first two third of the pitch
CNVX0 = CNVX0.merge(P0[['game_id', 'team_id', 'player_id', 'player_name']])
CNVX0 = CNVX0[CNVX0["x_a0"] <= 70]
In [16]:
# Initialize an empty list to collect the filtered events
results = []
for _, player in tqdm(P0.iterrows(), total=len(P0), desc="Processing players"):
# Extract player information
player_name = player['player_name']
player_id = player['player_id']
start_second = player['start_second']
end_second = player['end_second']
start_period = player['start_period']
end_period = player['end_period']
team_id = player['team_id']
game_id = player['game_id']
season = player['season_id']
# Filter actions for a specific game and exclude the given team_id
actions_against = A0[(A0['game_id'] == game_id) & (A0['team_id'] != team_id)]
actions_against0 = actions_against[
((actions_against['period_id'] == start_period) &
(actions_against['time_seconds'] >= start_second)) |
((actions_against['period_id'] == end_period) &
(actions_against['time_seconds'] <= end_second))]
# Add player information
actions_against0['convex_player_name'] = player_name
actions_against0['convex_player_id'] = player_id
actions_against0['convex_team_id'] = team_id
actions_against0['season_id'] = season
results.append(actions_against0)
# Combine results into a single DataFrame for this partition
final_result = pd.concat(results, ignore_index=True) if results else pd.DataFrame()
Processing players: 100%|██████████| 9031/9031 [00:07<00:00, 1129.81it/s]
In [17]:
# Group by 'played_id' and 'game_id' and then apply the filtering within each group
filtered_dfa = CNVX0.groupby(['player_id', 'player_name', 'game_id']).apply(lambda group:
group[
(group['y_a0'] >= group['y_a0'].mean() - group['y_a0'].std()) &
(group['y_a0'] <= group['y_a0'].mean() + group['y_a0'].std()) &
(group['x_a0'] >= group['x_a0'].mean() - group['x_a0'].std()) &
(group['x_a0'] <= group['x_a0'].mean() + group['x_a0'].std())
]
).reset_index(drop=True)
In [18]:
# Initialize a dictionary to store convex hull data with unique keys
convex_hull = {}
# Loop through each (player_name, team_name, game_id) to calculate the convex hulls
for (player_name, player_id, team_id, game_id), group in filtered_dfa.groupby(['player_name', 'player_id', 'team_id', 'game_id']):
points = group[['x_a0', 'y_a0']].values
if len(points) >= 3: # Convex hull requires at least 3 points
try:
# Attempt to create a convex hull if points are not collinear
hull = ConvexHull(points)
hull_points = points[hull.vertices]
hull_path = Path(hull_points) # Create a Path for hull boundary
# Use a unique key (could be a tuple of relevant identifiers)
key = (player_name, player_id, team_id, game_id)
convex_hull[key] = {
'hull_path': hull_path # Store the Path object
}
except QhullError:
# Skip the calculation if points are collinear or insufficient
continue
else:
# Skip if not enough points for a hull
continue
# Now, convex_hull_data is a dictionary with hull_path as a value.
for key, value in convex_hull.items():
hull_path = value['hull_path']
In [19]:
#Creating coordinates from the point of view of the defender
final_result['inverted_x_a0'] = 105 - final_result['x_a0']
final_result['inverted_y_a0'] = 68 - final_result['y_a0']
In [20]:
# Create a lookup dictionary for faster access
convex_hull_lookup = {}
for key, data in convex_hull.items():
convex_hull_lookup[key] = data['hull_path']
# Group data by player-game combination
grouped_data = final_result.groupby(['convex_player_id', 'convex_player_name', 'convex_team_id', 'game_id'])
# Initialize an empty list to store results
inside_convex_result = []
# Process each group
for (convex_player_id, convex_player_name, team_id, game_id), group_df in tqdm(grouped_data, desc="Processing player-game combinations"):
# Construct the key to retrieve the convex hull path
matching_key = (convex_player_name, convex_player_id, team_id, game_id)
# Check if the key exists in convex_hull_data
if matching_key in convex_hull_lookup:
hull_path = convex_hull_lookup[matching_key]
# Get the coordinates for all events as a numpy array (more efficient)
points = group_df[['inverted_x_a0', 'inverted_y_a0']].values
# Vectorized check for which points are inside the hull
if len(points) > 0:
# Using a more efficient approach if hull_path.contains_point supports arrays
try:
# Try vectorized operation if supported
inside_mask = np.array([hull_path.contains_point(point) for point in points])
if np.any(inside_mask):
inside_rows = group_df.loc[inside_mask]
inside_convex_result.append(inside_rows)
except Exception:
# Fallback to loop if vectorized operation not supported
inside_mask = [hull_path.contains_point(point) for point in points]
if any(inside_mask):
inside_rows = group_df.loc[inside_mask]
inside_convex_result.append(inside_rows)
# No need to call gc.collect() in each iteration
# Will do it periodically instead
# Periodic garbage collection (every 100 iterations)
if len(inside_convex_result) % 100 == 0 and inside_convex_result:
gc.collect()
# Final garbage collection before concat
gc.collect()
# Convert the list of DataFrames into a single DataFrame if there are results
if inside_convex_result:
inside_convex_result_df = pd.concat(inside_convex_result, ignore_index=True)
else:
inside_convex_result_df = pd.DataFrame()
Processing player-game combinations: 100%|██████████| 9031/9031 [00:21<00:00, 429.92it/s]
In [21]:
#Cleaning the dataframe before saving it
inside_convex_result_df = inside_convex_result_df.drop(['offensive_value', 'defensive_value'], axis=1).drop_duplicates()
inside_convex_result_df
Out[21]:
| game_id | original_event_id | action_id | period_id | time_seconds | team_id | player_id | x | y | dx | ... | is_recovery | team_name | fotmob_id | fotmob_name | vaep_value | convex_player_name | convex_player_id | convex_team_id | inverted_x_a0 | inverted_y_a0 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1834864 | 2.711097e+09 | 9371444 | 1 | 1177.50 | 87 | 315369.0 | 12.075 | 37.400 | 0.000 | ... | NaN | Juventus | 9885 | Juventus | 0.056515 | Pepe Reina | 2987 | 1290 | 12.075 | 37.400 |
| 1 | 1834864 | 2.711097e+09 | 9371445 | 1 | 1178.00 | 87 | 315369.0 | 12.915 | 37.264 | -12.915 | ... | NaN | Juventus | 9885 | Juventus | -0.008968 | Pepe Reina | 2987 | 1290 | 12.915 | 37.264 |
| 2 | 1834864 | 2.711097e+09 | 9371445 | 1 | 1178.00 | 87 | 315369.0 | 12.915 | 37.264 | -12.915 | ... | NaN | Juventus | 9885 | Juventus | -0.075889 | Pepe Reina | 2987 | 1290 | 12.915 | 37.264 |
| 3 | 1834864 | 2.711111e+09 | 9371959 | 1 | 2535.00 | 87 | 494525.0 | 14.595 | 35.700 | -2.310 | ... | NaN | Juventus | 9885 | Juventus | -0.032546 | Pepe Reina | 2987 | 1290 | 14.595 | 35.700 |
| 4 | 1834864 | 2.711113e+09 | 9372046 | 1 | 2718.00 | 87 | 353991.0 | 14.910 | 34.748 | 0.000 | ... | NaN | Juventus | 9885 | Juventus | 0.069313 | Pepe Reina | 2987 | 1290 | 14.910 | 34.748 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 498972 | 1835056 | 2.774332e+09 | 8836782 | 1 | 2294.00 | 77 | 149029.0 | 31.080 | 3.740 | 25.830 | ... | NaN | Lazio | 8543 | Lazio | -0.006645 | Kevin Martins | 540598 | 269 | 31.080 | 3.740 |
| 498973 | 1835033 | 2.741948e+09 | 9227027 | 2 | 221.00 | 278 | 128839.0 | 37.485 | 29.036 | -5.565 | ... | ball recovery | Genoa | 10233 | Genoa | 0.016769 | Tommaso Rubino | 544269 | 73 | 37.485 | 29.036 |
| 498974 | 1835033 | NaN | 9227627 | 2 | 1559.25 | 278 | 425115.0 | 40.950 | 25.908 | 24.465 | ... | NaN | Genoa | 10233 | Genoa | 0.000974 | Tommaso Rubino | 544269 | 73 | 40.950 | 25.908 |
| 498975 | 1835033 | 2.741955e+09 | 9227683 | 2 | 1728.00 | 278 | 317544.0 | 35.805 | 27.268 | 0.000 | ... | NaN | Genoa | 10233 | Genoa | 0.001061 | Tommaso Rubino | 544269 | 73 | 35.805 | 27.268 |
| 498976 | 1835033 | 2.741955e+09 | 9227684 | 2 | 1729.00 | 278 | 317544.0 | 35.805 | 27.268 | -3.255 | ... | NaN | Genoa | 10233 | Genoa | -0.003097 | Tommaso Rubino | 544269 | 73 | 35.805 | 27.268 |
498977 rows × 32 columns
#CONCATENATION OF OLD (CONVEXX) + NEW (inside_convex_result_df) DATA BEFORE STORING (inside_convex_result_dfC)
In [22]:
inside_convex_result_dfC = pd.concat([convexXX, inside_convex_result_df]).sort_values(by=['action_id'], ascending=True)
In [23]:
#Check of season ids
inside_convex_result_dfC.season_id.unique()
Out[23]:
array([2425, 2024])
In [24]:
#Check of competition ids
inside_convex_result_dfC.competition_id.unique()
Out[24]:
array(['BEL-Jupiler Pro League', 'BRA-Brasileirão', 'ENG-Championship',
'ENG-FA Cup', 'ENG-League Cup', 'ENG-League One', 'ENG-League Two',
'ENG-Premier League', 'ESP-La Liga', 'EU-Champions League',
'EU-Europa League', 'FRA-Ligue 1', 'GER-Bundesliga', 'ITA-Serie A',
'NED-Eredivisie', 'POR-Liga Portugal', 'RUS-Premier League',
'SCO-Premiership', 'USA-Major League Soccer'], dtype=object)
In [30]:
inside_convex_result_dfC.to_csv("actions_against_convex2425.csv")
In [ ]:
In [ ]:
In [ ]:
In [25]:
# Count occurrences of each action_id
action_count = inside_convex_result_df['action_id'].value_counts().reset_index()
action_count.columns = ['action_id', 'action_count']
# Merge the action counts with the original DataFrame
inside_convex_result_df = inside_convex_result_df.merge(action_count, on='action_id', how='left')
In [26]:
#Creating a weight for each event so that if an event is in 3 different convex hull the value/burden is shared between those 3 players
inside_convex_result_df['weight'] = 1/inside_convex_result_df['action_count']
In [27]:
V = (
inside_convex_result_df.groupby(['convex_player_name', 'convex_player_id', 'convex_team_id', 'season_id'])
.apply(lambda x: pd.Series({
'avg_goal_probability': (x['vaep_value'] * x['weight']).sum() / x['weight'].sum(),
'weighted_sum': (x['vaep_value'] * x['weight']).sum(),
'weighted_count': (x['weight']).sum()
}))
.reset_index()
)
In [28]:
# Convert 'minutes_played' to total minutes with error handling
def convert_to_minutes(time_str):
try:
# Convert to string in case it's a float (e.g., NaN)
time_str = str(time_str)
# Split the time string into minutes and seconds
minutes, seconds = map(int, time_str.split(':'))
# Convert total time to minutes (seconds converted to fraction of minutes)
return minutes + seconds / 60
except (ValueError, AttributeError):
# Handle cases where the conversion fails (e.g., NaN or bad format)
return 0 # or use `np.nan` if you prefer to mark as missing
# Apply the conversion function to the 'minutes_played' column
playersX['minutes_played_converted'] = playersX['minutes_played'].apply(convert_to_minutes)
In [29]:
#Adding minutes played for players to later filtering
mp = playersX.groupby(["player_name", "player_id", "team_id", "season_id"])["minutes_played_converted"].sum().reset_index(name='minutes_played')
mp = mp.rename(columns={
"player_name": "convex_player_name",
"player_id": "convex_player_id",
"team_id": "convex_team_id",
})
V = V.merge(mp, on=['convex_player_name', 'convex_player_id', 'convex_team_id', 'season_id'], how='left')
V = V.drop_duplicates()
Va = V[V['minutes_played'] >= 500]
In [30]:
#Creation of the metric to judge our work
Va['vaep_padj'] = (500 * Va['weighted_sum'])/Va['weighted_count']
In [31]:
#Cleaning of the table
Va = Va[['convex_player_name', 'convex_player_id', 'convex_team_id', 'season_id', 'minutes_played', 'avg_goal_probability', 'vaep_padj']]
In [32]:
# Rename the columns and further cleaning
position0 = position[['player_name', 'player_id', 'season_id', 'position']].rename(
columns={'player_name': 'convex_player_name', 'player_id': 'convex_player_id'})
Vb = Va.merge(position0).drop_duplicates()
#Exploring unique positions
Vb.position.unique()
Out[32]:
array(['LFB', 'RFB', 'LCB', 'LWB', 'AWL', 'CM', 'DM', 'RCB', 'HB', 'GK',
'RWB', 'AWR', 'SS', 'ST', 'AML', 'CB', 'AM', 'AMR'], dtype=object)
In [33]:
# Define the position group function
def map_position_group(pos):
if pos in ['CB', 'LCB', 'RCB']:
return 'CB'
elif pos in ['LWB', 'RWB', 'RFB', 'LFB']:
return 'WB'
elif pos in ['HB', 'DM', 'CM']:
return 'CDM'
elif pos in ['AMR', 'AM', 'AWL', 'AWR', 'AML']:
return 'AMW'
elif pos in ['SS', 'ST']:
return 'ST'
elif pos in ['GK']:
return 'GK'
else:
return 'Other'
# Create a new column for position groups
Vb['position_group'] = Vb['position'].apply(map_position_group)
In [34]:
#Exploring the best for our metric in a given position
Vc = Vb[Vb['position_group'].isin(['CB'])]
Vc.sort_values("vaep_padj", ascending=True).head(20)
Out[34]:
| convex_player_name | convex_player_id | convex_team_id | season_id | minutes_played | avg_goal_probability | vaep_padj | position | position_group | |
|---|---|---|---|---|---|---|---|---|---|
| 338 | Sead Kolasinac | 110260 | 300 | 2425 | 2452.450000 | 0.000955 | 0.477483 | LCB | CB |
| 296 | Pawel Dawidowicz | 121577 | 76 | 2425 | 1552.516667 | 0.001202 | 0.601100 | RCB | CB |
| 31 | Andrea Carboni | 390493 | 269 | 2425 | 1812.650000 | 0.001309 | 0.654730 | LCB | CB |
| 13 | Alessandro Bastoni | 329665 | 75 | 2425 | 2792.450000 | 0.001460 | 0.729774 | LCB | CB |
| 130 | Gianluca Mancini | 244804 | 84 | 2425 | 3381.483333 | 0.001535 | 0.767562 | RCB | CB |
| 283 | Odilon Kossounou | 384516 | 300 | 2425 | 1075.450000 | 0.001612 | 0.806060 | RCB | CB |
| 50 | Berat Djimsiti | 100962 | 300 | 2425 | 2828.000000 | 0.001686 | 0.842839 | CB | CB |
| 378 | Yann Bisseck | 349126 | 75 | 2425 | 1707.950000 | 0.001789 | 0.894334 | RCB | CB |
| 70 | Daniele Ghilardi | 486744 | 76 | 2425 | 1431.416667 | 0.001795 | 0.897595 | LCB | CB |
| 48 | Benjamin Pavard | 259648 | 75 | 2425 | 2230.200000 | 0.001829 | 0.914666 | RCB | CB |
| 131 | Giorgio Altare | 338476 | 85 | 2425 | 840.766667 | 0.001849 | 0.924484 | RCB | CB |
| 61 | Christian Kabasele | 68393 | 86 | 2425 | 1004.166667 | 0.001869 | 0.934338 | RCB | CB |
| 322 | Saba Goglichidze | 498982 | 272 | 2425 | 2161.750000 | 0.001936 | 0.967846 | RCB | CB |
| 226 | Mario Hermoso | 344156 | 84 | 2425 | 628.033333 | 0.001978 | 0.988878 | LCB | CB |
| 240 | Mattia De Sciglio | 107275 | 272 | 2425 | 859.216667 | 0.002022 | 1.010881 | RCB | CB |
| 15 | Alessandro Buongiorno | 333373 | 276 | 2425 | 1922.033333 | 0.002061 | 1.030597 | LCB | CB |
| 358 | Thomas Kristensen | 440029 | 86 | 2425 | 1328.050000 | 0.002064 | 1.032159 | RCB | CB |
| 55 | Bremer | 339875 | 87 | 2425 | 678.650000 | 0.002079 | 1.039396 | LCB | CB |
| 198 | Lloyd Kelly | 332419 | 87 | 2425 | 641.350000 | 0.002125 | 1.062487 | LCB | CB |
| 301 | Pierre Kalulu | 391836 | 87 | 2425 | 2587.600000 | 0.002137 | 1.068335 | LCB | CB |
In [35]:
#Visualizing actions that were assigned to a player in his convex hulls
from mplsoccer import Pitch, VerticalPitch, lines
import matplotlib.colors as mcolors
#Selecting player
filtered_events_df1 = inside_convex_result_df[inside_convex_result_df['convex_player_id'] == 100962]
#Creating the colormap
cmap = matplotlib.colormaps.get_cmap('bwr')
vmin = -0.006 # or set manually, e.g., vmin = 0
vmax = 0.019 # or set manually, e.g., vmax = 1
# Normalize the data to the specified range
norm = mcolors.Normalize(vmin=vmin, vmax=vmax)
# Apply the colormap and normalization to your data
colors = cmap(norm(filtered_events_df1['vaep_value']))
#Plotting the data
pitch = Pitch(pitch_type='custom', pitch_width=68, pitch_length=105, goal_type='box', linewidth=2, line_color='black', half=False)
bins = (6, 5)
fig, ax = pitch.draw(figsize=(16, 11), constrained_layout=True, tight_layout=False)
# plot the heatmap - darker colors = more passes originating from that square
cmap = mcolors.LinearSegmentedColormap.from_list("custom_red", ["#D7D1CF", "#FF0000"])
bs_heatmap = pitch.bin_statistic(filtered_events_df1.inverted_x_a0, filtered_events_df1.inverted_y_a0, statistic='count', bins=bins)
hm = pitch.heatmap(bs_heatmap, ax=ax, cmap=cmap, zorder = 3, alpha = 0.5)
scatter = pitch.scatter(filtered_events_df1.inverted_x_a0, filtered_events_df1.inverted_y_a0, ax=ax, edgecolor='#000000', facecolor=colors, s=50)
In [ ]:
In [ ]:
In [ ]: