In [1]:
# Import necessary libraries for data manipulation, visualization, and analysis
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import urllib
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
from tqdm import tqdm
from scipy.spatial import ConvexHull, QhullError
from matplotlib.path import Path
import socceraction
import socceraction.atomic.spadl as atomicspadl
import socceraction.spadl as spadl
import gc
In [2]:
# Load custom fonts for visualization
fe_regular = fm.FontEntry(
    fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-Regular.ttf',
    name='SourceSansPro-Regular'
)
fe_semibold = fm.FontEntry(
    fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-SemiBold.ttf',
    name='SourceSansPro-SemiBold'
)

# Insert both fonts into the font manager
fm.fontManager.ttflist.insert(0, fe_regular)
fm.fontManager.ttflist.insert(1, fe_semibold)

# Set the font family to the custom regular font
matplotlib.rcParams['font.family'] = fe_regular.name
In [3]:
# Load datasets from CSV files
VAEP = pd.read_csv("aVAEPactions.csv", index_col = 0) # ATOMIC VAEP values for actions
fb = pd.read_csv("teamsFOTMOB.csv", index_col = 0) # Ids mapping from FOTMOB and Whoscored
position = pd.read_csv("clustered_position.csv", index_col = 0) #File with positions cluster for players
players = pd.read_csv("players2425.csv", index_col = 0) #Players infos from games
teams = pd.read_csv("teams2425.csv", index_col = 0) #Teams in 2425 season
aactions = pd.read_csv("atomic_actions2425.csv", index_col = 0) #Event data 
games0 = pd.read_csv("games2425.csv", index_col = 0) #Games infos
games = games0[["game_id", "competition_id", "season_id", "game_date"]] #Games columns I want
In [4]:
#Loading and cleaning the dataset of events which are recoveries but are not identified as such in atomic event data framework
recoveries = pd.read_csv("recoveries_id2425.csv", index_col = 0)
recoveries['original_event_id'] = recoveries['event_id'].astype(int)
recoveries = recoveries.rename(columns={'type_name' : 'is_recovery'})
recoveries = recoveries[['game_id', 'is_recovery', 'original_event_id']]
In [5]:
#Already created event dataframe with actions correspongind to player in convex hull to concatenate with new events later on
convexXX = pd.read_csv("actions_against_convex2425.csv", index_col = 0)
In [6]:
#Adding info to atomic event data
aactions = atomicspadl.add_names(aactions)
In [7]:
#Conversion of time played format into seconds
def convert_to_seconds(time_str):
    try:
        # Convert to string in case it's a float (e.g., NaN)
        time_str = str(time_str)
        # Split the time string into minutes and seconds
        minutes, seconds = map(int, time_str.split(':'))
        # Convert total time to seconds (minutes converted to seconds)
        return minutes * 60 + seconds
    except (ValueError, AttributeError):
        # Handle cases where the conversion fails (e.g., NaN or bad format)
        return 0  # or use `np.nan` if you prefer to mark as missing

# Apply the conversion function to the 'minutes_played' column
players['seconds_played'] = players['minutes_played'].apply(convert_to_seconds)
players['start_second'] = players['start_second'].apply(convert_to_seconds)
players['end_second'] = players['end_second'].apply(convert_to_seconds)
In [8]:
#Creating the event dataframe to work on
dfa = (
    aactions
    .merge(games, how="left")
    .merge(recoveries, how="left")
    .merge(fb, how="left")
    .merge(VAEP, how="left")
    )
In [9]:
#Creating the player dataframe to work on
playersX = (
    players
    .merge(fb, how="left")
    .merge(position, how="left"))
In [10]:
#List of competition in dataset
dfa.competition_id.unique()
Out[10]:
array(['BEL-Jupiler Pro League', 'BRA-Brasileirão', 'ENG-Championship',
       'ENG-FA Cup', 'ENG-League Cup', 'ENG-League One', 'ENG-League Two',
       'ENG-Premier League', 'ESP-La Liga', 'EU-Champions League',
       'EU-Europa League', 'FRA-Ligue 1', 'GER-Bundesliga', 'ITA-Serie A',
       'NED-Eredivisie', 'POR-Liga Portugal', 'RUS-Premier League',
       'SCO-Premiership', 'USA-Major League Soccer'], dtype=object)
In [11]:
#Defining the competition I'd go look for
competition = ['ITA-Serie A']
In [12]:
#Filtering the actions for specific type so to keep only those in the list and dribbles that are not recoveries
actionsX = dfa[dfa["type_name"].isin(['pass', 'receival', 'dribble', 'out',  'take_on', 'shot', 'offside', 'bad_touch', 'cross', 'goal'])]
A0 = actionsX[actionsX['is_recovery'] != 'ball recovery']

#Filtering the action on which to draw the convex hulls
CNVX0 = dfa[dfa["type_name"].isin(['pass', 'receival', 'dribble', 'out',  'take_on', 'shot', 'offside', 'bad_touch', 'cross', 'goal',
                                 'dribble', 'interception', 'clearance', 'tackle', 'foul'])]
In [13]:
#Filtering which players we want to draw convex hulls for
playersX = players[players["seconds_played"] >= 1]
P0 = playersX[playersX['starting_position'] != 'GK']

#IF IS NECESSARY TO FILTER FOR COMPETITION AND/OR SEASON

In [14]:
#Filtering by competition
A0 = actionsX[actionsX["competition_id"].isin(competition)]
P0 = playersX[playersX["competition_id"].isin(competition)]
CNVX0 = CNVX0[CNVX0["competition_id"].isin(competition)]
In [15]:
#Further adjustements on the convex hulls events – we don't want to keep what happens in the first two third of the pitch
CNVX0 = CNVX0.merge(P0[['game_id', 'team_id', 'player_id', 'player_name']])
CNVX0 = CNVX0[CNVX0["x_a0"] <= 70]
In [16]:
# Initialize an empty list to collect the filtered events
results = []

for _, player in tqdm(P0.iterrows(), total=len(P0), desc="Processing players"):
   # Extract player information
    player_name = player['player_name']
    player_id = player['player_id']
    start_second = player['start_second']
    end_second = player['end_second']
    start_period = player['start_period']
    end_period = player['end_period']
    team_id = player['team_id']
    game_id = player['game_id']  
    season = player['season_id']  

   # Filter actions for a specific game and exclude the given team_id
    actions_against = A0[(A0['game_id'] == game_id) & (A0['team_id'] != team_id)]
    
    actions_against0 = actions_against[
        ((actions_against['period_id'] == start_period) & 
         (actions_against['time_seconds'] >= start_second)) |
        ((actions_against['period_id'] == end_period) & 
         (actions_against['time_seconds'] <= end_second))]
        
    # Add player information
    actions_against0['convex_player_name'] = player_name
    actions_against0['convex_player_id'] = player_id
    actions_against0['convex_team_id'] = team_id
    actions_against0['season_id'] = season

    results.append(actions_against0)

# Combine results into a single DataFrame for this partition
final_result = pd.concat(results, ignore_index=True) if results else pd.DataFrame()
Processing players: 100%|██████████| 9031/9031 [00:07<00:00, 1129.81it/s]
In [17]:
# Group by 'played_id' and 'game_id' and then apply the filtering within each group
filtered_dfa = CNVX0.groupby(['player_id', 'player_name', 'game_id']).apply(lambda group: 
    group[
        (group['y_a0'] >= group['y_a0'].mean() - group['y_a0'].std()) &
        (group['y_a0'] <= group['y_a0'].mean() + group['y_a0'].std()) &
        (group['x_a0'] >= group['x_a0'].mean() - group['x_a0'].std()) &
        (group['x_a0'] <= group['x_a0'].mean() + group['x_a0'].std())
    ]
).reset_index(drop=True)
In [18]:
# Initialize a dictionary to store convex hull data with unique keys
convex_hull = {}

# Loop through each (player_name, team_name, game_id) to calculate the convex hulls
for (player_name, player_id, team_id, game_id), group in filtered_dfa.groupby(['player_name', 'player_id', 'team_id', 'game_id']):
    points = group[['x_a0', 'y_a0']].values
    
    if len(points) >= 3:  # Convex hull requires at least 3 points
        try:
            # Attempt to create a convex hull if points are not collinear
            hull = ConvexHull(points)
            hull_points = points[hull.vertices]
            hull_path = Path(hull_points)  # Create a Path for hull boundary

            # Use a unique key (could be a tuple of relevant identifiers)
            key = (player_name, player_id, team_id, game_id)
            convex_hull[key] = {
                'hull_path': hull_path  # Store the Path object
            }

        except QhullError:
            # Skip the calculation if points are collinear or insufficient
            continue
    else:
        # Skip if not enough points for a hull
        continue

# Now, convex_hull_data is a dictionary with hull_path as a value.
for key, value in convex_hull.items():
    hull_path = value['hull_path']
In [19]:
#Creating coordinates from the point of view of the defender
final_result['inverted_x_a0'] = 105 - final_result['x_a0']
final_result['inverted_y_a0'] = 68 - final_result['y_a0']
In [20]:
# Create a lookup dictionary for faster access
convex_hull_lookup = {}
for key, data in convex_hull.items():
    convex_hull_lookup[key] = data['hull_path']

# Group data by player-game combination
grouped_data = final_result.groupby(['convex_player_id', 'convex_player_name', 'convex_team_id', 'game_id'])

# Initialize an empty list to store results
inside_convex_result = []

# Process each group
for (convex_player_id, convex_player_name, team_id, game_id), group_df in tqdm(grouped_data, desc="Processing player-game combinations"):
    # Construct the key to retrieve the convex hull path
    matching_key = (convex_player_name, convex_player_id, team_id, game_id)
    
    # Check if the key exists in convex_hull_data
    if matching_key in convex_hull_lookup:
        hull_path = convex_hull_lookup[matching_key]
        
        # Get the coordinates for all events as a numpy array (more efficient)
        points = group_df[['inverted_x_a0', 'inverted_y_a0']].values
        
        # Vectorized check for which points are inside the hull
        if len(points) > 0:
            # Using a more efficient approach if hull_path.contains_point supports arrays
            try:
                # Try vectorized operation if supported
                inside_mask = np.array([hull_path.contains_point(point) for point in points])
                if np.any(inside_mask):
                    inside_rows = group_df.loc[inside_mask]
                    inside_convex_result.append(inside_rows)
            except Exception:
                # Fallback to loop if vectorized operation not supported
                inside_mask = [hull_path.contains_point(point) for point in points]
                if any(inside_mask):
                    inside_rows = group_df.loc[inside_mask]
                    inside_convex_result.append(inside_rows)
    
    # No need to call gc.collect() in each iteration
    # Will do it periodically instead

    # Periodic garbage collection (every 100 iterations)
    if len(inside_convex_result) % 100 == 0 and inside_convex_result:
        gc.collect()

# Final garbage collection before concat
gc.collect()

# Convert the list of DataFrames into a single DataFrame if there are results
if inside_convex_result:
    inside_convex_result_df = pd.concat(inside_convex_result, ignore_index=True)
else:
    inside_convex_result_df = pd.DataFrame()
Processing player-game combinations: 100%|██████████| 9031/9031 [00:21<00:00, 429.92it/s]
In [21]:
#Cleaning the dataframe before saving it
inside_convex_result_df = inside_convex_result_df.drop(['offensive_value', 'defensive_value'], axis=1).drop_duplicates()
inside_convex_result_df
Out[21]:
game_id original_event_id action_id period_id time_seconds team_id player_id x y dx ... is_recovery team_name fotmob_id fotmob_name vaep_value convex_player_name convex_player_id convex_team_id inverted_x_a0 inverted_y_a0
0 1834864 2.711097e+09 9371444 1 1177.50 87 315369.0 12.075 37.400 0.000 ... NaN Juventus 9885 Juventus 0.056515 Pepe Reina 2987 1290 12.075 37.400
1 1834864 2.711097e+09 9371445 1 1178.00 87 315369.0 12.915 37.264 -12.915 ... NaN Juventus 9885 Juventus -0.008968 Pepe Reina 2987 1290 12.915 37.264
2 1834864 2.711097e+09 9371445 1 1178.00 87 315369.0 12.915 37.264 -12.915 ... NaN Juventus 9885 Juventus -0.075889 Pepe Reina 2987 1290 12.915 37.264
3 1834864 2.711111e+09 9371959 1 2535.00 87 494525.0 14.595 35.700 -2.310 ... NaN Juventus 9885 Juventus -0.032546 Pepe Reina 2987 1290 14.595 35.700
4 1834864 2.711113e+09 9372046 1 2718.00 87 353991.0 14.910 34.748 0.000 ... NaN Juventus 9885 Juventus 0.069313 Pepe Reina 2987 1290 14.910 34.748
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
498972 1835056 2.774332e+09 8836782 1 2294.00 77 149029.0 31.080 3.740 25.830 ... NaN Lazio 8543 Lazio -0.006645 Kevin Martins 540598 269 31.080 3.740
498973 1835033 2.741948e+09 9227027 2 221.00 278 128839.0 37.485 29.036 -5.565 ... ball recovery Genoa 10233 Genoa 0.016769 Tommaso Rubino 544269 73 37.485 29.036
498974 1835033 NaN 9227627 2 1559.25 278 425115.0 40.950 25.908 24.465 ... NaN Genoa 10233 Genoa 0.000974 Tommaso Rubino 544269 73 40.950 25.908
498975 1835033 2.741955e+09 9227683 2 1728.00 278 317544.0 35.805 27.268 0.000 ... NaN Genoa 10233 Genoa 0.001061 Tommaso Rubino 544269 73 35.805 27.268
498976 1835033 2.741955e+09 9227684 2 1729.00 278 317544.0 35.805 27.268 -3.255 ... NaN Genoa 10233 Genoa -0.003097 Tommaso Rubino 544269 73 35.805 27.268

498977 rows × 32 columns

#CONCATENATION OF OLD (CONVEXX) + NEW (inside_convex_result_df) DATA BEFORE STORING (inside_convex_result_dfC)

In [22]:
inside_convex_result_dfC = pd.concat([convexXX, inside_convex_result_df]).sort_values(by=['action_id'], ascending=True)
In [23]:
#Check of season ids
inside_convex_result_dfC.season_id.unique()
Out[23]:
array([2425, 2024])
In [24]:
#Check of competition ids
inside_convex_result_dfC.competition_id.unique()
Out[24]:
array(['BEL-Jupiler Pro League', 'BRA-Brasileirão', 'ENG-Championship',
       'ENG-FA Cup', 'ENG-League Cup', 'ENG-League One', 'ENG-League Two',
       'ENG-Premier League', 'ESP-La Liga', 'EU-Champions League',
       'EU-Europa League', 'FRA-Ligue 1', 'GER-Bundesliga', 'ITA-Serie A',
       'NED-Eredivisie', 'POR-Liga Portugal', 'RUS-Premier League',
       'SCO-Premiership', 'USA-Major League Soccer'], dtype=object)
In [30]:
inside_convex_result_dfC.to_csv("actions_against_convex2425.csv")
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [25]:
# Count occurrences of each action_id
action_count = inside_convex_result_df['action_id'].value_counts().reset_index()
action_count.columns = ['action_id', 'action_count']

# Merge the action counts with the original DataFrame
inside_convex_result_df = inside_convex_result_df.merge(action_count, on='action_id', how='left')
In [26]:
#Creating a weight for each event so that if an event is in 3 different convex hull the value/burden is shared between those 3 players
inside_convex_result_df['weight'] = 1/inside_convex_result_df['action_count']
In [27]:
V = (
    inside_convex_result_df.groupby(['convex_player_name', 'convex_player_id', 'convex_team_id', 'season_id'])
    .apply(lambda x: pd.Series({
        'avg_goal_probability': (x['vaep_value'] * x['weight']).sum() / x['weight'].sum(),
        'weighted_sum': (x['vaep_value'] * x['weight']).sum(),
        'weighted_count': (x['weight']).sum()
    }))
    .reset_index()
)
In [28]:
# Convert 'minutes_played' to total minutes with error handling
def convert_to_minutes(time_str):
    try:
        # Convert to string in case it's a float (e.g., NaN)
        time_str = str(time_str)
        # Split the time string into minutes and seconds
        minutes, seconds = map(int, time_str.split(':'))
        # Convert total time to minutes (seconds converted to fraction of minutes)
        return minutes + seconds / 60
    except (ValueError, AttributeError):
        # Handle cases where the conversion fails (e.g., NaN or bad format)
        return 0  # or use `np.nan` if you prefer to mark as missing

# Apply the conversion function to the 'minutes_played' column
playersX['minutes_played_converted'] = playersX['minutes_played'].apply(convert_to_minutes)
In [29]:
#Adding minutes played for players to later filtering
mp = playersX.groupby(["player_name", "player_id", "team_id", "season_id"])["minutes_played_converted"].sum().reset_index(name='minutes_played')

mp = mp.rename(columns={
    "player_name": "convex_player_name",
    "player_id": "convex_player_id",
    "team_id": "convex_team_id",
})

V = V.merge(mp, on=['convex_player_name', 'convex_player_id', 'convex_team_id', 'season_id'], how='left')
V = V.drop_duplicates()
Va = V[V['minutes_played'] >= 500]
In [30]:
#Creation of the metric to judge our work
Va['vaep_padj'] = (500 * Va['weighted_sum'])/Va['weighted_count']
In [31]:
#Cleaning of the table
Va = Va[['convex_player_name', 'convex_player_id', 'convex_team_id', 'season_id', 'minutes_played', 'avg_goal_probability', 'vaep_padj']]
In [32]:
# Rename the columns and further cleaning
position0 = position[['player_name', 'player_id', 'season_id', 'position']].rename(
    columns={'player_name': 'convex_player_name', 'player_id': 'convex_player_id'})

Vb = Va.merge(position0).drop_duplicates()
#Exploring unique positions
Vb.position.unique()
Out[32]:
array(['LFB', 'RFB', 'LCB', 'LWB', 'AWL', 'CM', 'DM', 'RCB', 'HB', 'GK',
       'RWB', 'AWR', 'SS', 'ST', 'AML', 'CB', 'AM', 'AMR'], dtype=object)
In [33]:
# Define the position group function
def map_position_group(pos):
    if pos in ['CB', 'LCB', 'RCB']:
        return 'CB'
    elif pos in ['LWB', 'RWB', 'RFB', 'LFB']:
        return 'WB'
    elif pos in ['HB', 'DM', 'CM']:
        return 'CDM'
    elif pos in ['AMR', 'AM', 'AWL', 'AWR', 'AML']:
        return 'AMW'
    elif pos in ['SS', 'ST']:
        return 'ST'
    elif pos in ['GK']:
        return 'GK'
    else:
        return 'Other'

# Create a new column for position groups
Vb['position_group'] = Vb['position'].apply(map_position_group)
In [34]:
#Exploring the best for our metric in a given position
Vc = Vb[Vb['position_group'].isin(['CB'])]
Vc.sort_values("vaep_padj", ascending=True).head(20)
Out[34]:
convex_player_name convex_player_id convex_team_id season_id minutes_played avg_goal_probability vaep_padj position position_group
338 Sead Kolasinac 110260 300 2425 2452.450000 0.000955 0.477483 LCB CB
296 Pawel Dawidowicz 121577 76 2425 1552.516667 0.001202 0.601100 RCB CB
31 Andrea Carboni 390493 269 2425 1812.650000 0.001309 0.654730 LCB CB
13 Alessandro Bastoni 329665 75 2425 2792.450000 0.001460 0.729774 LCB CB
130 Gianluca Mancini 244804 84 2425 3381.483333 0.001535 0.767562 RCB CB
283 Odilon Kossounou 384516 300 2425 1075.450000 0.001612 0.806060 RCB CB
50 Berat Djimsiti 100962 300 2425 2828.000000 0.001686 0.842839 CB CB
378 Yann Bisseck 349126 75 2425 1707.950000 0.001789 0.894334 RCB CB
70 Daniele Ghilardi 486744 76 2425 1431.416667 0.001795 0.897595 LCB CB
48 Benjamin Pavard 259648 75 2425 2230.200000 0.001829 0.914666 RCB CB
131 Giorgio Altare 338476 85 2425 840.766667 0.001849 0.924484 RCB CB
61 Christian Kabasele 68393 86 2425 1004.166667 0.001869 0.934338 RCB CB
322 Saba Goglichidze 498982 272 2425 2161.750000 0.001936 0.967846 RCB CB
226 Mario Hermoso 344156 84 2425 628.033333 0.001978 0.988878 LCB CB
240 Mattia De Sciglio 107275 272 2425 859.216667 0.002022 1.010881 RCB CB
15 Alessandro Buongiorno 333373 276 2425 1922.033333 0.002061 1.030597 LCB CB
358 Thomas Kristensen 440029 86 2425 1328.050000 0.002064 1.032159 RCB CB
55 Bremer 339875 87 2425 678.650000 0.002079 1.039396 LCB CB
198 Lloyd Kelly 332419 87 2425 641.350000 0.002125 1.062487 LCB CB
301 Pierre Kalulu 391836 87 2425 2587.600000 0.002137 1.068335 LCB CB
In [35]:
#Visualizing actions that were assigned to a player in his convex hulls
from mplsoccer import Pitch, VerticalPitch, lines
import matplotlib.colors as mcolors
#Selecting player
filtered_events_df1 = inside_convex_result_df[inside_convex_result_df['convex_player_id'] == 100962]
#Creating the colormap
cmap = matplotlib.colormaps.get_cmap('bwr')
vmin = -0.006  # or set manually, e.g., vmin = 0
vmax = 0.019  # or set manually, e.g., vmax = 1
# Normalize the data to the specified range
norm = mcolors.Normalize(vmin=vmin, vmax=vmax)
# Apply the colormap and normalization to your data
colors = cmap(norm(filtered_events_df1['vaep_value']))
#Plotting the data
pitch = Pitch(pitch_type='custom', pitch_width=68, pitch_length=105, goal_type='box', linewidth=2, line_color='black', half=False)
bins = (6, 5)
fig, ax = pitch.draw(figsize=(16, 11), constrained_layout=True, tight_layout=False)
# plot the heatmap - darker colors = more passes originating from that square
cmap = mcolors.LinearSegmentedColormap.from_list("custom_red", ["#D7D1CF", "#FF0000"])
bs_heatmap = pitch.bin_statistic(filtered_events_df1.inverted_x_a0, filtered_events_df1.inverted_y_a0, statistic='count', bins=bins)
hm = pitch.heatmap(bs_heatmap, ax=ax, cmap=cmap, zorder = 3, alpha = 0.5)
scatter = pitch.scatter(filtered_events_df1.inverted_x_a0, filtered_events_df1.inverted_y_a0, ax=ax, edgecolor='#000000', facecolor=colors, s=50)
No description has been provided for this image
In [ ]:
 
In [ ]:
 
In [ ]: