In [1]:
# Import necessary libraries for data manipulation, visualization, and analysis
import socceraction
import socceraction.spadl as spadl
import matplotlib
import matplotlib.font_manager as fm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from tqdm.notebook import tqdm
from mplsoccer import Pitch, VerticalPitch, lines
from matplotlib.colors import LinearSegmentedColormap
from scipy.ndimage import gaussian_filter
# Suppress warnings
warnings.filterwarnings("ignore")
In [2]:
# Load custom fonts for visualization
fe_regular = fm.FontEntry(
fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-Regular.ttf',
name='SourceSansPro-Regular'
)
fe_semibold = fm.FontEntry(
fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-SemiBold.ttf',
name='SourceSansPro-SemiBold'
)
# Insert both fonts into the font manager
fm.fontManager.ttflist.insert(0, fe_regular)
fm.fontManager.ttflist.insert(1, fe_semibold)
# Set the font family to the custom regular font
matplotlib.rcParams['font.family'] = fe_regular.name
In [3]:
# Load the FotMob team-id mapping from CSV
fb = pd.read_csv("teamsFOTMOB.csv", index_col = 0)
In [4]:
season = 2425
In [5]:
players = pd.read_csv(f"players{season}.csv", index_col = 0)
games = pd.read_csv(f"games{season}.csv", index_col = 0)
actions = pd.read_csv(f"actions{season}.csv", index_col = 0)
In [6]:
# Add action type, result, and bodypart names to the SPADL event data
atomic = spadl.add_names(actions)
# Merge the player info with the FotMob/WhoScored team-id mapping
players = players.merge(fb, how="left")
In [7]:
# Convert the 'MM:SS' time-played format into seconds
def convert_to_seconds(time_str):
    try:
        # Convert to string in case it's a float (e.g., NaN)
        time_str = str(time_str)
        # Split the time string into minutes and seconds
        minutes, seconds = map(int, time_str.split(':'))
        # Convert total time to seconds (minutes converted to seconds)
        return minutes * 60 + seconds
    except (ValueError, AttributeError):
        # Handle cases where the conversion fails (e.g., NaN or bad format)
        return 0  # or use np.nan to mark as missing

# Apply the conversion function to the time columns
players['seconds_played'] = players['minutes_played'].apply(convert_to_seconds)
players['start_second'] = players['start_second'].apply(convert_to_seconds)
players['end_second'] = players['end_second'].apply(convert_to_seconds)
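A quick sanity check of the conversion, using illustrative values rather than rows from the dataset:

convert_to_seconds('90:00')   # 5400
convert_to_seconds(np.nan)    # 0 (malformed input falls back to 0)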
In [8]:
# Keep only the selected player columns
players_info = players[["player_name", "player_id", 'game_id', "team_name", "fotmob_id", 'season_id', 'competition_id', 'seconds_played']]
In [9]:
# Merge the events with the processed player info
df0 = atomic.merge(players_info, how='left', on=['game_id', 'player_id'])
In [10]:
# Total minutes played per player, season, and team
mp = players.groupby(['player_id', 'player_name', 'season_id', 'team_id'], observed=True)['seconds_played'].sum().reset_index(name='seconds_played')
mp['minutes_played'] = mp['seconds_played']/60
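mp is not used again in this notebook; a typical downstream use, assuming a purely illustrative 900-minute cutoff, is filtering out low-minute players before any per-90 analysis:

# Hypothetical example: keep only players with at least 900 minutes in the season
regulars = mp[mp['minutes_played'] >= 900]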
In [11]:
# Keep only the event types used in the analysis
df = df0[df0['type_name'].isin(['pass', 'dribble', 'interception', 'clearance', 'take_on', 'tackle', 'shot', 'bad_touch',
                                'keeper_pick_up', 'foul', 'keeper_save', 'cross', 'keeper_claim', 'goal', 'keeper_punch'])]
In [12]:
df.head()
Out[12]:
game_id | original_event_id | period_id | time_seconds | team_id | player_id | start_x | end_x | start_y | end_y | ... | end_y_a0 | type_name | result_name | bodypart_name | player_name | team_name | fotmob_id | season_id | competition_id | seconds_played | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1819070 | 2.754291e+09 | 1 | 0.0 | 752 | 68335.0 | 52.185 | 57.435 | 34.000 | 34.204 | ... | 33.796 | pass | success | foot | Denis Odoi | Royal Antwerp | 9988.0 | 2425.0 | BEL-Jupiler Pro League | 6150.0 |
1 | 1819070 | NaN | 1 | 0.5 | 752 | 106413.0 | 57.435 | 53.340 | 34.204 | 33.660 | ... | 34.340 | dribble | success | foot | Dennis Praet | Royal Antwerp | 9988.0 | 2425.0 | BEL-Jupiler Pro League | 4881.0 |
2 | 1819070 | 2.754291e+09 | 1 | 1.0 | 752 | 106413.0 | 53.340 | 63.840 | 33.660 | 54.536 | ... | 13.464 | pass | success | foot | Dennis Praet | Royal Antwerp | 9988.0 | 2425.0 | BEL-Jupiler Pro League | 4881.0 |
3 | 1819070 | NaN | 1 | 3.0 | 752 | 388097.0 | 63.840 | 54.495 | 54.536 | 58.208 | ... | 9.792 | dribble | success | foot | Jelle Bataille | Royal Antwerp | 9988.0 | 2425.0 | BEL-Jupiler Pro League | 6150.0 |
4 | 1819070 | 2.754291e+09 | 1 | 5.0 | 752 | 388097.0 | 54.495 | 73.500 | 58.208 | 42.024 | ... | 25.976 | pass | success | foot | Jelle Bataille | Royal Antwerp | 9988.0 | 2425.0 | BEL-Jupiler Pro League | 6150.0 |
5 rows × 27 columns
In [13]:
# Define the worker function at module level (outside any other function)
def process_combination(args):
    """Process a single player-team-season combination"""
    import numpy as np
    from scipy.ndimage import gaussian_filter
    import pandas as pd

    combo_idx, combinations_array, df, player_names, player_games_dict, player_game_times_dict = args

    # Extract combination details
    player_id, team_id, season_id = combinations_array[combo_idx]

    # Get player data efficiently
    dfx = df[(df['player_id'] == player_id) &
             (df['team_id'] == team_id) &
             (df['season_id'] == season_id)]
    if dfx.empty:
        return None

    # Get player name from precomputed dict
    player_name = player_names.get(player_id, "Unknown Player")

    # Get games this player participated in from precomputed dict
    combo_key = (player_id, team_id, season_id)
    player_games = player_games_dict.get(combo_key, set())

    # Initialize container for team actions
    all_team_actions = []

    # Process each game
    for game_id in player_games:
        # Get team actions for this game
        game_team_actions = df[(df['team_id'] == team_id) &
                               (df['season_id'] == season_id) &
                               (df['game_id'] == game_id)]

        # Check if we have player timing data
        if player_game_times_dict:
            player_game_key = (player_id, game_id)
            player_periods = player_game_times_dict.get(player_game_key, [])

            if player_periods:
                # Process each period the player was on the pitch
                for period_data in player_periods:
                    # Create boolean masks for filtering
                    start_mask = ((game_team_actions['period_id'] == period_data['start_period']) &
                                  (game_team_actions['time_seconds'] >= period_data['start_second']))
                    end_mask = ((game_team_actions['period_id'] == period_data['end_period']) &
                                (game_team_actions['time_seconds'] <= period_data['end_second']))

                    # Combine masks and filter
                    period_actions = game_team_actions[start_mask | end_mask]
                    if not period_actions.empty:
                        all_team_actions.append(period_actions)
            else:
                # Fallback to all team actions
                all_team_actions.append(game_team_actions)
        else:
            # No timing data available, use all team actions
            all_team_actions.append(game_team_actions)

    # Skip if no team actions found
    if not all_team_actions:
        return None

    # Combine all team actions in one step
    dfy = pd.concat(all_team_actions, ignore_index=True)
    if dfy.empty:
        return None

    # Extract coordinates as numpy arrays for faster calculation
    player_x = dfx.start_x.values
    player_y = dfx.start_y.values
    team_x = dfy.start_x.values
    team_y = dfy.start_y.values

    # Recreate the pitch object here since it's not picklable
    from mplsoccer import Pitch
    pitch = Pitch(pitch_type='custom', pitch_width=68, pitch_length=105, goal_type='box',
                  linewidth=1.25, line_color='#000000', line_zorder=2, pitch_color='#D7D1CF')

    # Calculate bin statistics directly from the numpy arrays
    bin_statisticplayer = pitch.bin_statistic(player_x, player_y, statistic='count', bins=(105, 68))
    bin_statisticteam = pitch.bin_statistic(team_x, team_y, statistic='count', bins=(105, 68))

    # Use vectorized operations for normalization
    normalized_statistic = np.zeros_like(bin_statisticplayer['statistic'])
    mask = (bin_statisticteam['statistic'] > 0)
    normalized_statistic[mask] = bin_statisticplayer['statistic'][mask] / bin_statisticteam['statistic'][mask]

    # Apply Gaussian filter
    filtered_statistic = gaussian_filter(normalized_statistic, 3)

    return {
        'player_id': player_id,
        'player_name': player_name,
        'team_id': team_id,
        'season_id': season_id,
        'statistic': filtered_statistic
    }
def create_player_bin_statistics_df(df, player_games_df=None, n_jobs=1):
    """
    Create bin statistics for each player-team combination and
    store them in a DataFrame - optimized for performance.

    Parameters
    ----------
    df : pandas DataFrame
        The event data containing player events
    player_games_df : pandas DataFrame, optional
        DataFrame containing information about when players were on the pitch
    n_jobs : int, default=1
        Number of processes to use for parallel execution. Set >1 to use multiple cores.

    Returns
    -------
    pandas DataFrame
        DataFrame with columns: player_id, player_name, team_id, season_id, statistic
    """
    import pandas as pd
    from collections import defaultdict
    from multiprocessing import Pool
    from tqdm import tqdm

    # Get unique player-team-season combinations
    combinations = df[['player_id', 'team_id', 'season_id']].drop_duplicates().reset_index(drop=True)
    print(f"Found {len(combinations)} unique player-team-season combinations")

    # Convert combinations to a simple numpy array for pickling
    combinations_array = combinations.values

    # Precompute lookups for faster access
    # 1. Player name lookup
    player_names = dict(zip(df['player_id'], df['player_name']))

    # 2. Player games lookup
    player_games_dict = defaultdict(set)
    for _, row in df[['player_id', 'team_id', 'season_id', 'game_id']].iterrows():
        key = (row['player_id'], row['team_id'], row['season_id'])
        player_games_dict[key].add(row['game_id'])

    # 3. Preindex player_games_df for faster lookup if provided
    player_game_times_dict = {}
    if player_games_df is not None:
        for _, row in player_games_df.iterrows():
            key = (row['player_id'], row['game_id'])
            if key not in player_game_times_dict:
                player_game_times_dict[key] = []
            player_game_times_dict[key].append({
                'start_period': row['start_period'],
                'start_second': row['start_second'],
                'end_period': row['end_period'],
                'end_second': row['end_second']
            })

    # Process combinations (parallel or sequential)
    args_list = [(i, combinations_array, df, player_names, player_games_dict, player_game_times_dict)
                 for i in range(len(combinations))]

    if n_jobs > 1:
        # Parallel processing
        with Pool(processes=n_jobs) as pool:
            all_results = list(tqdm(
                pool.imap(process_combination, args_list),
                total=len(combinations),
                desc="Processing player combinations"
            ))
    else:
        # Sequential processing
        all_results = [process_combination(args) for args in tqdm(args_list,
                                                                  desc="Processing player combinations")]

    # Filter out None results
    results = [r for r in all_results if r is not None]

    # Create the result DataFrame
    result_df = pd.DataFrame(results)
    print(f"Created DataFrame with {len(result_df)} rows")

    return result_df
In [14]:
# Build bin statistics for every player-team-season combination
# (runs on a single core by default; pass n_jobs>1 for multi-core processing)
player_stats_df = create_player_bin_statistics_df(df, players)
Found 9859 unique player-team-season combinations
Processing player combinations: 100%|██████████| 9859/9859 [39:07<00:00, 4.20it/s]
Created DataFrame with 9852 rows
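The sequential run above took roughly 39 minutes; on a multi-core machine the same call can be parallelised by passing n_jobs (the worker count here is illustrative):

# player_stats_df = create_player_bin_statistics_df(df, players, n_jobs=4)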
In [15]:
# Add the team_name column and check the structure of the dataframe
player_stats_df = player_stats_df.merge(fb[['team_id', 'team_name']])
player_stats_df
Out[15]:
player_id | player_name | team_id | season_id | statistic | team_name | |
---|---|---|---|---|---|---|
0 | 68335.0 | Denis Odoi | 752.0 | 2425.0 | [[0.0417474703252548, 0.040847603349596355, 0.... | Royal Antwerp |
1 | 106413.0 | Dennis Praet | 752.0 | 2425.0 | [[0.029253781215685632, 0.03454770587822273, 0... | Royal Antwerp |
2 | 388097.0 | Jelle Bataille | 752.0 | 2425.0 | [[0.09212127779509127, 0.08787376959553757, 0.... | Royal Antwerp |
3 | 69933.0 | Toby Alderweireld | 752.0 | 2425.0 | [[0.018920814067782926, 0.018380303110433316, ... | Royal Antwerp |
4 | 37502.0 | Tjaronn Chery | 752.0 | 2425.0 | [[0.030456096264926267, 0.03979106265170132, 0... | Royal Antwerp |
... | ... | ... | ... | ... | ... | ... |
9847 | 462097.0 | Sota Kitahara | 5973.0 | 2024.0 | [[0.0011733655462917167, 0.0011315123881704301... | Seattle |
9848 | 477308.0 | Adam Beaudry | 1120.0 | 2024.0 | [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... | Colorado |
9849 | 542900.0 | Ervin Torres | 29664.0 | 2024.0 | [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... | Austin FC |
9850 | 328504.0 | Emanuel Reynoso | 9293.0 | 2024.0 | [[0.0005024170892179867, 0.001031943622745739,... | Minnesota United |
9851 | 512799.0 | Cyprian Kachwele | 11134.0 | 2024.0 | [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... | Vancouver |
9852 rows × 6 columns
In [16]:
# Save the bin-statistics table for later use
player_stats_df.to_pickle(f"bin_statistic{season}.pkl")
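To see what the stored arrays represent, here is a minimal sketch of drawing one of them as a heatmap. The player name is taken from the output above, and the dummy bin_statistic call only serves to build a dict with the same 105x68 grid before swapping in the precomputed, normalised and smoothed statistic:

# Minimal sketch: draw the stored share-of-team-actions heatmap for one player
row = player_stats_df[player_stats_df['player_name'] == 'Denis Odoi'].iloc[0]

pitch = Pitch(pitch_type='custom', pitch_width=68, pitch_length=105, goal_type='box',
              linewidth=1.25, line_color='#000000', line_zorder=2, pitch_color='#D7D1CF')
fig, ax = pitch.draw(figsize=(8, 5.5))

# Build a bin_statistic dict with the matching grid, then replace its values
# with the precomputed statistic from the table
bs = pitch.bin_statistic(np.array([52.5]), np.array([34.0]), statistic='count', bins=(105, 68))
bs['statistic'] = row['statistic']
pitch.heatmap(bs, ax=ax, cmap='Reds')
plt.show()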