In [1]:
# Import necessary libraries for data manipulation, visualization, and analysis
import socceraction
import socceraction.spadl as spadl
import matplotlib
import matplotlib.font_manager as fm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from tqdm.notebook import tqdm
from mplsoccer import Pitch, VerticalPitch, lines
from matplotlib.colors import LinearSegmentedColormap
from scipy.ndimage import gaussian_filter

# Suppress warnings
warnings.filterwarnings("ignore")
In [2]:
# Load custom fonts for visualization
fe_regular = fm.FontEntry(
    fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-Regular.ttf',
    name='SourceSansPro-Regular'
)
fe_semibold = fm.FontEntry(
    fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-SemiBold.ttf',
    name='SourceSansPro-SemiBold'
)

# Insert both fonts into the font manager
fm.fontManager.ttflist.insert(0, fe_regular)
fm.fontManager.ttflist.insert(1, fe_semibold)

# Set the font family to the custom regular font
matplotlib.rcParams['font.family'] = fe_regular.name
In [3]:
# Load datasets from CSV files
fb = pd.read_csv("teamsFOTMOB.csv", index_col = 0)
In [4]:
season = 2425
In [5]:
players = pd.read_csv(f"players{season}.csv", index_col = 0)
games = pd.read_csv(f"games{season}.csv", index_col = 0)
actions = pd.read_csv(f"actions{season}.csv", index_col = 0)
In [6]:
#Adding infos for the event data file
atomic = spadl.add_names(actions)

#Merging players infos with mapping of teams id between fotmob and whoscored
players = players.merge(fb, how="left")
In [7]:
#Conversion of time played format into seconds
def convert_to_seconds(time_str):
    try:
        # Convert to string in case it's a float (e.g., NaN)
        time_str = str(time_str)
        # Split the time string into minutes and seconds
        minutes, seconds = map(int, time_str.split(':'))
        # Convert total time to seconds (minutes converted to seconds)
        return minutes * 60 + seconds
    except (ValueError, AttributeError):
        # Handle cases where the conversion fails (e.g., NaN or bad format)
        return 0  # or use `np.nan` if you prefer to mark as missing

# Apply the conversion function to the 'minutes_played' column
players['seconds_played'] = players['minutes_played'].apply(convert_to_seconds)
players['start_second'] = players['start_second'].apply(convert_to_seconds)
players['end_second'] = players['end_second'].apply(convert_to_seconds)
In [8]:
#Keeping only players selected infos
players_info = players[["player_name", "player_id", 'game_id', "team_name", "fotmob_id", 'season_id', 'competition_id', 'seconds_played']]
In [9]:
#Merging events with players infos after elaborations
df0 = atomic.merge(players_info, how='left', on=['game_id', 'player_id'])
In [10]:
mp = players.groupby(['player_id', 'player_name', 'season_id', 'team_id'], observed=True)['seconds_played'].sum().reset_index(name='seconds_played')
mp['minutes_played'] = mp['seconds_played']/60
In [11]:
#Selecting event types I want to work on
df = df0[df0['type_name'].isin(['pass', 'dribble', 'interception', 'clearance', 'take_on', 'tackle', 'shot', 'bad_touch',
                                 'keeper_pick_up', 'foul', 'keeper_save', 'cross', 'keeper_claim', 'goal', 'keeper_punch'])]
In [12]:
df.head()
Out[12]:
game_id original_event_id period_id time_seconds team_id player_id start_x end_x start_y end_y ... end_y_a0 type_name result_name bodypart_name player_name team_name fotmob_id season_id competition_id seconds_played
0 1819070 2.754291e+09 1 0.0 752 68335.0 52.185 57.435 34.000 34.204 ... 33.796 pass success foot Denis Odoi Royal Antwerp 9988.0 2425.0 BEL-Jupiler Pro League 6150.0
1 1819070 NaN 1 0.5 752 106413.0 57.435 53.340 34.204 33.660 ... 34.340 dribble success foot Dennis Praet Royal Antwerp 9988.0 2425.0 BEL-Jupiler Pro League 4881.0
2 1819070 2.754291e+09 1 1.0 752 106413.0 53.340 63.840 33.660 54.536 ... 13.464 pass success foot Dennis Praet Royal Antwerp 9988.0 2425.0 BEL-Jupiler Pro League 4881.0
3 1819070 NaN 1 3.0 752 388097.0 63.840 54.495 54.536 58.208 ... 9.792 dribble success foot Jelle Bataille Royal Antwerp 9988.0 2425.0 BEL-Jupiler Pro League 6150.0
4 1819070 2.754291e+09 1 5.0 752 388097.0 54.495 73.500 58.208 42.024 ... 25.976 pass success foot Jelle Bataille Royal Antwerp 9988.0 2425.0 BEL-Jupiler Pro League 6150.0

5 rows × 27 columns

In [13]:
# Define the worker function at module level (outside any other function)
def process_combination(args):
    """Process a single player-team-season combination"""
    import numpy as np
    from scipy.ndimage import gaussian_filter
    import pandas as pd
    
    combo_idx, combinations_array, df, player_names, player_games_dict, player_game_times_dict = args
    
    # Extract combination details
    player_id, team_id, season_id = combinations_array[combo_idx]
    
    # Get player data efficiently
    dfx = df[(df['player_id'] == player_id) & 
             (df['team_id'] == team_id) & 
             (df['season_id'] == season_id)]
    
    if dfx.empty:
        return None
    
    # Get player name from precomputed dict
    player_name = player_names.get(player_id, "Unknown Player")
    
    # Get games this player participated in from precomputed dict
    combo_key = (player_id, team_id, season_id)
    player_games = player_games_dict.get(combo_key, set())
    
    # Initialize container for team actions
    all_team_actions = []
    
    # Process each game
    for game_id in player_games:
        # Get team actions for this game
        game_team_actions = df[(df['team_id'] == team_id) & 
                               (df['season_id'] == season_id) &
                               (df['game_id'] == game_id)]
        
        # Check if we have player timing data
        if player_game_times_dict:
            player_game_key = (player_id, game_id)
            player_periods = player_game_times_dict.get(player_game_key, [])
            
            if player_periods:
                # Process each period the player was on pitch
                for period_data in player_periods:
                    # Create a boolean mask for filtering
                    start_mask = ((game_team_actions['period_id'] == period_data['start_period']) & 
                                 (game_team_actions['time_seconds'] >= period_data['start_second']))
                    end_mask = ((game_team_actions['period_id'] == period_data['end_period']) & 
                               (game_team_actions['time_seconds'] <= period_data['end_second']))
                    
                    # Combine masks and filter
                    period_actions = game_team_actions[start_mask | end_mask]
                    
                    if not period_actions.empty:
                        all_team_actions.append(period_actions)
            else:
                # Fallback to all team actions
                all_team_actions.append(game_team_actions)
        else:
            # No timing data available, use all team actions
            all_team_actions.append(game_team_actions)
    
    # Skip if no team actions found
    if not all_team_actions:
        return None
        
    # Combine all team actions in one step
    dfy = pd.concat(all_team_actions, ignore_index=True)
    
    if dfy.empty:
        return None
    
    # Extract coordinates as numpy arrays for faster calculation
    player_x = dfx.start_x.values
    player_y = dfx.start_y.values
    team_x = dfy.start_x.values
    team_y = dfy.start_y.values
    
    # We need to recreate the pitch object here since it's not picklable
    from mplsoccer import Pitch
    pitch = Pitch(pitch_type='custom', pitch_width=68, pitch_length=105, goal_type='box', 
                 linewidth=1.25, line_color='#000000', line_zorder=2, pitch_color='#D7D1CF')
    
    # Calculate bin statistics directly with numpy arrays
    bin_statisticplayer = pitch.bin_statistic(player_x, player_y, statistic='count', bins=(105, 68))
    bin_statisticteam = pitch.bin_statistic(team_x, team_y, statistic='count', bins=(105, 68))
    
    # Use vectorized operations for normalization
    normalized_statistic = np.zeros_like(bin_statisticplayer['statistic'])
    mask = (bin_statisticteam['statistic'] > 0)
    normalized_statistic[mask] = bin_statisticplayer['statistic'][mask] / bin_statisticteam['statistic'][mask]
    
    # Apply Gaussian filter
    filtered_statistic = gaussian_filter(normalized_statistic, 3)
    
    return {
        'player_id': player_id,
        'player_name': player_name,
        'team_id': team_id,
        'season_id': season_id,
        'statistic': filtered_statistic
    }


def create_player_bin_statistics_df(df, player_games_df=None, n_jobs=1):
    """
    Create bin statistics for each player-team combination and 
    store them in a DataFrame - optimized for performance.
    
    Parameters:
    -----------
    df : pandas DataFrame
        The event data containing player events
    player_games_df : pandas DataFrame
        DataFrame containing information about when players were on the pitch
    n_jobs : int, default=1
        Number of processes to use for parallel execution. Set >1 to use multiple cores.
        
    Returns:
    --------
    pandas DataFrame
        DataFrame with columns: player_id, player_name, team_id, season_id, statistic
    """
    import numpy as np
    from scipy.ndimage import gaussian_filter
    from tqdm import tqdm
    from collections import defaultdict
    import pandas as pd
    from multiprocessing import Pool
    from matplotlib.colors import LinearSegmentedColormap
    from mplsoccer import Pitch
    
    # Get unique player-team combinations
    combinations = df[['player_id', 'team_id', 'season_id']].drop_duplicates().reset_index(drop=True)
    print(f"Found {len(combinations)} unique player-team-season combinations")
    
    # Convert combinations to a simple numpy array for pickling
    combinations_array = combinations.values
    
    # Precompute lookups for faster access
    # 1. Create player name lookup
    player_names = dict(zip(df['player_id'], df['player_name']))
    
    # 2. Create player games lookup
    player_games_dict = defaultdict(set)
    for _, row in df[['player_id', 'team_id', 'season_id', 'game_id']].iterrows():
        key = (row['player_id'], row['team_id'], row['season_id'])
        player_games_dict[key].add(row['game_id'])
    
    # 3. Preindex player_games_df for faster lookup if provided
    player_game_times_dict = {}
    if player_games_df is not None:
        for _, row in player_games_df.iterrows():
            key = (row['player_id'], row['game_id'])
            if key not in player_game_times_dict:
                player_game_times_dict[key] = []
            player_game_times_dict[key].append({
                'start_period': row['start_period'],
                'start_second': row['start_second'],
                'end_period': row['end_period'],
                'end_second': row['end_second']
            })
    
    # Process combinations (parallel or sequential)
    args_list = [(i, combinations_array, df, player_names, player_games_dict, player_game_times_dict)
                for i in range(len(combinations))]
    
    if n_jobs > 1:
        # Parallel processing
        with Pool(processes=n_jobs) as pool:
            all_results = list(tqdm(
                pool.imap(process_combination, args_list),
                total=len(combinations),
                desc="Processing player combinations"
            ))
    else:
        # Sequential processing
        all_results = [process_combination(args) for args in tqdm(args_list, 
                                                          desc="Processing player combinations")]
    
    # Filter out None results
    results = [r for r in all_results if r is not None]
    
    # Create DataFrame
    result_df = pd.DataFrame(results)
    print(f"Created DataFrame with {len(result_df)} rows")
    
    return result_df
In [14]:
# For multi-core processing (use all available cores)
player_stats_df = create_player_bin_statistics_df(df, players)
Found 9859 unique player-team-season combinations
Processing player combinations: 100%|██████████| 9859/9859 [39:07<00:00,  4.20it/s]  
Created DataFrame with 9852 rows

In [15]:
#Check the structure of the dataframe after adding team_name column
player_stats_df = player_stats_df.merge(fb[['team_id', 'team_name']])
player_stats_df
Out[15]:
player_id player_name team_id season_id statistic team_name
0 68335.0 Denis Odoi 752.0 2425.0 [[0.0417474703252548, 0.040847603349596355, 0.... Royal Antwerp
1 106413.0 Dennis Praet 752.0 2425.0 [[0.029253781215685632, 0.03454770587822273, 0... Royal Antwerp
2 388097.0 Jelle Bataille 752.0 2425.0 [[0.09212127779509127, 0.08787376959553757, 0.... Royal Antwerp
3 69933.0 Toby Alderweireld 752.0 2425.0 [[0.018920814067782926, 0.018380303110433316, ... Royal Antwerp
4 37502.0 Tjaronn Chery 752.0 2425.0 [[0.030456096264926267, 0.03979106265170132, 0... Royal Antwerp
... ... ... ... ... ... ...
9847 462097.0 Sota Kitahara 5973.0 2024.0 [[0.0011733655462917167, 0.0011315123881704301... Seattle
9848 477308.0 Adam Beaudry 1120.0 2024.0 [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... Colorado
9849 542900.0 Ervin Torres 29664.0 2024.0 [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... Austin FC
9850 328504.0 Emanuel Reynoso 9293.0 2024.0 [[0.0005024170892179867, 0.001031943622745739,... Minnesota United
9851 512799.0 Cyprian Kachwele 11134.0 2024.0 [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... Vancouver

9852 rows × 6 columns

In [16]:
#Saving the table
player_stats_df.to_pickle(f"bin_statistic{season}.pkl")
In [ ]: