# Import necessary libraries for data manipulation, visualization, and analysis
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import urllib
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
from tqdm import tqdm
from scipy.spatial import ConvexHull, QhullError
from matplotlib.path import Path
import socceraction
import socceraction.atomic.spadl as atomicspadl
import socceraction.spadl as spadl
import gc

# Load custom fonts for visualization
fe_regular = fm.FontEntry(
    fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-Regular.ttf',
    name='SourceSansPro-Regular'
)
fe_semibold = fm.FontEntry(
    fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-SemiBold.ttf',
    name='SourceSansPro-SemiBold'
)

# Insert both fonts into the font manager
fm.fontManager.ttflist.insert(0, fe_regular)
fm.fontManager.ttflist.insert(1, fe_semibold)

# Set the font family to the custom regular font
matplotlib.rcParams['font.family'] = fe_regular.name

# Load datasets from CSV files
VAEP = pd.read_csv("aVAEPactions.csv", index_col = 0) # ATOMIC VAEP values for actions
fb = pd.read_csv("teamsFOTMOB.csv", index_col = 0) # Ids mapping from FOTMOB and Whoscored
position = pd.read_csv("clustered_position.csv", index_col = 0) #File with positions cluster for players
players = pd.read_csv("players2425.csv", index_col = 0) #Players infos from games
teams = pd.read_csv("teams2425.csv", index_col = 0) #Teams in 2425 season
aactions = pd.read_csv("atomic_actions2425.csv", index_col = 0) #Event data 
games0 = pd.read_csv("games2425.csv", index_col = 0) #Games infos
games = games0[["game_id", "competition_id", "season_id", "game_date"]] #Games columns I want

#Loading and cleaning the dataset of events which are recoveries but are not identified as such in atomic event data framework
recoveries = pd.read_csv("recoveries_id2425.csv", index_col = 0)
recoveries['original_event_id'] = recoveries['event_id'].astype(int)
recoveries = recoveries.rename(columns={'type_name' : 'is_recovery'})
recoveries = recoveries[['game_id', 'is_recovery', 'original_event_id']]

#Already created event dataframe with actions correspongind to player in convex hull to concatenate with new events later on
convexXX = pd.read_csv("actions_against_convex2425.csv", index_col = 0)

#Adding info to atomic event data
aactions = atomicspadl.add_names(aactions)

#Conversion of time played format into seconds
def convert_to_seconds(time_str):
    try:
        # Convert to string in case it's a float (e.g., NaN)
        time_str = str(time_str)
        # Split the time string into minutes and seconds
        minutes, seconds = map(int, time_str.split(':'))
        # Convert total time to seconds (minutes converted to seconds)
        return minutes * 60 + seconds
    except (ValueError, AttributeError):
        # Handle cases where the conversion fails (e.g., NaN or bad format)
        return 0  # or use `np.nan` if you prefer to mark as missing

# Apply the conversion function to the 'minutes_played' column
players['seconds_played'] = players['minutes_played'].apply(convert_to_seconds)
players['start_second'] = players['start_second'].apply(convert_to_seconds)
players['end_second'] = players['end_second'].apply(convert_to_seconds)

#Creating the event dataframe to work on
dfa = (
    aactions
    .merge(games, how="left")
    .merge(recoveries, how="left")
    .merge(fb, how="left")
    .merge(VAEP, how="left")
    )

#Creating the player dataframe to work on
playersX = (
    players
    .merge(fb, how="left")
    .merge(position, how="left"))

#List of competition in dataset
dfa.competition_id.unique()

array(['BEL-Jupiler Pro League', 'BRA-Brasileirão', 'ENG-Championship',
       'ENG-FA Cup', 'ENG-League Cup', 'ENG-League One', 'ENG-League Two',
       'ENG-Premier League', 'ESP-La Liga', 'EU-Champions League',
       'EU-Europa League', 'FRA-Ligue 1', 'GER-Bundesliga', 'ITA-Serie A',
       'NED-Eredivisie', 'POR-Liga Portugal', 'RUS-Premier League',
       'SCO-Premiership', 'USA-Major League Soccer'], dtype=object)

#Defining the competition I'd go look for
competition = ['ITA-Serie A']

#Filtering the actions for specific type so to keep only those in the list and dribbles that are not recoveries
actionsX = dfa[dfa["type_name"].isin(['pass', 'receival', 'dribble', 'out',  'take_on', 'shot', 'offside', 'bad_touch', 'cross', 'goal'])]
A0 = actionsX[actionsX['is_recovery'] != 'ball recovery']

#Filtering the action on which to draw the convex hulls
CNVX0 = dfa[dfa["type_name"].isin(['pass', 'receival', 'dribble', 'out',  'take_on', 'shot', 'offside', 'bad_touch', 'cross', 'goal',
                                 'dribble', 'interception', 'clearance', 'tackle', 'foul'])]

#Filtering which players we want to draw convex hulls for
playersX = players[players["seconds_played"] >= 1]
P0 = playersX[playersX['starting_position'] != 'GK']

#Filtering by competition
A0 = actionsX[actionsX["competition_id"].isin(competition)]
P0 = playersX[playersX["competition_id"].isin(competition)]
CNVX0 = CNVX0[CNVX0["competition_id"].isin(competition)]

#Further adjustements on the convex hulls events – we don't want to keep what happens in the first two third of the pitch
CNVX0 = CNVX0.merge(P0[['game_id', 'team_id', 'player_id', 'player_name']])
CNVX0 = CNVX0[CNVX0["x_a0"] <= 70]

# Initialize an empty list to collect the filtered events
results = []

for _, player in tqdm(P0.iterrows(), total=len(P0), desc="Processing players"):
   # Extract player information
    player_name = player['player_name']
    player_id = player['player_id']
    start_second = player['start_second']
    end_second = player['end_second']
    start_period = player['start_period']
    end_period = player['end_period']
    team_id = player['team_id']
    game_id = player['game_id']  
    season = player['season_id']  

   # Filter actions for a specific game and exclude the given team_id
    actions_against = A0[(A0['game_id'] == game_id) & (A0['team_id'] != team_id)]
    
    actions_against0 = actions_against[
        ((actions_against['period_id'] == start_period) & 
         (actions_against['time_seconds'] >= start_second)) |
        ((actions_against['period_id'] == end_period) & 
         (actions_against['time_seconds'] <= end_second))]
        
    # Add player information
    actions_against0['convex_player_name'] = player_name
    actions_against0['convex_player_id'] = player_id
    actions_against0['convex_team_id'] = team_id
    actions_against0['season_id'] = season

    results.append(actions_against0)

# Combine results into a single DataFrame for this partition
final_result = pd.concat(results, ignore_index=True) if results else pd.DataFrame()

Processing players: 100%|██████████| 9031/9031 [00:07<00:00, 1129.81it/s]

# Group by 'played_id' and 'game_id' and then apply the filtering within each group
filtered_dfa = CNVX0.groupby(['player_id', 'player_name', 'game_id']).apply(lambda group: 
    group[
        (group['y_a0'] >= group['y_a0'].mean() - group['y_a0'].std()) &
        (group['y_a0'] <= group['y_a0'].mean() + group['y_a0'].std()) &
        (group['x_a0'] >= group['x_a0'].mean() - group['x_a0'].std()) &
        (group['x_a0'] <= group['x_a0'].mean() + group['x_a0'].std())
    ]
).reset_index(drop=True)

# Initialize a dictionary to store convex hull data with unique keys
convex_hull = {}

# Loop through each (player_name, team_name, game_id) to calculate the convex hulls
for (player_name, player_id, team_id, game_id), group in filtered_dfa.groupby(['player_name', 'player_id', 'team_id', 'game_id']):
    points = group[['x_a0', 'y_a0']].values
    
    if len(points) >= 3:  # Convex hull requires at least 3 points
        try:
            # Attempt to create a convex hull if points are not collinear
            hull = ConvexHull(points)
            hull_points = points[hull.vertices]
            hull_path = Path(hull_points)  # Create a Path for hull boundary

            # Use a unique key (could be a tuple of relevant identifiers)
            key = (player_name, player_id, team_id, game_id)
            convex_hull[key] = {
                'hull_path': hull_path  # Store the Path object
            }

        except QhullError:
            # Skip the calculation if points are collinear or insufficient
            continue
    else:
        # Skip if not enough points for a hull
        continue

# Now, convex_hull_data is a dictionary with hull_path as a value.
for key, value in convex_hull.items():
    hull_path = value['hull_path']

#Creating coordinates from the point of view of the defender
final_result['inverted_x_a0'] = 105 - final_result['x_a0']
final_result['inverted_y_a0'] = 68 - final_result['y_a0']

# Create a lookup dictionary for faster access
convex_hull_lookup = {}
for key, data in convex_hull.items():
    convex_hull_lookup[key] = data['hull_path']

# Group data by player-game combination
grouped_data = final_result.groupby(['convex_player_id', 'convex_player_name', 'convex_team_id', 'game_id'])

# Initialize an empty list to store results
inside_convex_result = []

# Process each group
for (convex_player_id, convex_player_name, team_id, game_id), group_df in tqdm(grouped_data, desc="Processing player-game combinations"):
    # Construct the key to retrieve the convex hull path
    matching_key = (convex_player_name, convex_player_id, team_id, game_id)
    
    # Check if the key exists in convex_hull_data
    if matching_key in convex_hull_lookup:
        hull_path = convex_hull_lookup[matching_key]
        
        # Get the coordinates for all events as a numpy array (more efficient)
        points = group_df[['inverted_x_a0', 'inverted_y_a0']].values
        
        # Vectorized check for which points are inside the hull
        if len(points) > 0:
            # Using a more efficient approach if hull_path.contains_point supports arrays
            try:
                # Try vectorized operation if supported
                inside_mask = np.array([hull_path.contains_point(point) for point in points])
                if np.any(inside_mask):
                    inside_rows = group_df.loc[inside_mask]
                    inside_convex_result.append(inside_rows)
            except Exception:
                # Fallback to loop if vectorized operation not supported
                inside_mask = [hull_path.contains_point(point) for point in points]
                if any(inside_mask):
                    inside_rows = group_df.loc[inside_mask]
                    inside_convex_result.append(inside_rows)
    
    # No need to call gc.collect() in each iteration
    # Will do it periodically instead

    # Periodic garbage collection (every 100 iterations)
    if len(inside_convex_result) % 100 == 0 and inside_convex_result:
        gc.collect()

# Final garbage collection before concat
gc.collect()

# Convert the list of DataFrames into a single DataFrame if there are results
if inside_convex_result:
    inside_convex_result_df = pd.concat(inside_convex_result, ignore_index=True)
else:
    inside_convex_result_df = pd.DataFrame()

Processing player-game combinations: 100%|██████████| 9031/9031 [00:21<00:00, 429.92it/s]

#Cleaning the dataframe before saving it
inside_convex_result_df = inside_convex_result_df.drop(['offensive_value', 'defensive_value'], axis=1).drop_duplicates()
inside_convex_result_df

inside_convex_result_dfC = pd.concat([convexXX, inside_convex_result_df]).sort_values(by=['action_id'], ascending=True)

#Check of season ids
inside_convex_result_dfC.season_id.unique()

array([2425, 2024])

#Check of competition ids
inside_convex_result_dfC.competition_id.unique()

array(['BEL-Jupiler Pro League', 'BRA-Brasileirão', 'ENG-Championship',
       'ENG-FA Cup', 'ENG-League Cup', 'ENG-League One', 'ENG-League Two',
       'ENG-Premier League', 'ESP-La Liga', 'EU-Champions League',
       'EU-Europa League', 'FRA-Ligue 1', 'GER-Bundesliga', 'ITA-Serie A',
       'NED-Eredivisie', 'POR-Liga Portugal', 'RUS-Premier League',
       'SCO-Premiership', 'USA-Major League Soccer'], dtype=object)

inside_convex_result_dfC.to_csv("actions_against_convex2425.csv")

# Count occurrences of each action_id
action_count = inside_convex_result_df['action_id'].value_counts().reset_index()
action_count.columns = ['action_id', 'action_count']

# Merge the action counts with the original DataFrame
inside_convex_result_df = inside_convex_result_df.merge(action_count, on='action_id', how='left')

#Creating a weight for each event so that if an event is in 3 different convex hull the value/burden is shared between those 3 players
inside_convex_result_df['weight'] = 1/inside_convex_result_df['action_count']

V = (
    inside_convex_result_df.groupby(['convex_player_name', 'convex_player_id', 'convex_team_id', 'season_id'])
    .apply(lambda x: pd.Series({
        'avg_goal_probability': (x['vaep_value'] * x['weight']).sum() / x['weight'].sum(),
        'weighted_sum': (x['vaep_value'] * x['weight']).sum(),
        'weighted_count': (x['weight']).sum()
    }))
    .reset_index()
)

# Convert 'minutes_played' to total minutes with error handling
def convert_to_minutes(time_str):
    try:
        # Convert to string in case it's a float (e.g., NaN)
        time_str = str(time_str)
        # Split the time string into minutes and seconds
        minutes, seconds = map(int, time_str.split(':'))
        # Convert total time to minutes (seconds converted to fraction of minutes)
        return minutes + seconds / 60
    except (ValueError, AttributeError):
        # Handle cases where the conversion fails (e.g., NaN or bad format)
        return 0  # or use `np.nan` if you prefer to mark as missing

# Apply the conversion function to the 'minutes_played' column
playersX['minutes_played_converted'] = playersX['minutes_played'].apply(convert_to_minutes)

#Adding minutes played for players to later filtering
mp = playersX.groupby(["player_name", "player_id", "team_id", "season_id"])["minutes_played_converted"].sum().reset_index(name='minutes_played')

mp = mp.rename(columns={
    "player_name": "convex_player_name",
    "player_id": "convex_player_id",
    "team_id": "convex_team_id",
})

V = V.merge(mp, on=['convex_player_name', 'convex_player_id', 'convex_team_id', 'season_id'], how='left')
V = V.drop_duplicates()
Va = V[V['minutes_played'] >= 500]

#Creation of the metric to judge our work
Va['vaep_padj'] = (500 * Va['weighted_sum'])/Va['weighted_count']

#Cleaning of the table
Va = Va[['convex_player_name', 'convex_player_id', 'convex_team_id', 'season_id', 'minutes_played', 'avg_goal_probability', 'vaep_padj']]

# Rename the columns and further cleaning
position0 = position[['player_name', 'player_id', 'season_id', 'position']].rename(
    columns={'player_name': 'convex_player_name', 'player_id': 'convex_player_id'})

Vb = Va.merge(position0).drop_duplicates()
#Exploring unique positions
Vb.position.unique()

array(['LFB', 'RFB', 'LCB', 'LWB', 'AWL', 'CM', 'DM', 'RCB', 'HB', 'GK',
       'RWB', 'AWR', 'SS', 'ST', 'AML', 'CB', 'AM', 'AMR'], dtype=object)

# Define the position group function
def map_position_group(pos):
    if pos in ['CB', 'LCB', 'RCB']:
        return 'CB'
    elif pos in ['LWB', 'RWB', 'RFB', 'LFB']:
        return 'WB'
    elif pos in ['HB', 'DM', 'CM']:
        return 'CDM'
    elif pos in ['AMR', 'AM', 'AWL', 'AWR', 'AML']:
        return 'AMW'
    elif pos in ['SS', 'ST']:
        return 'ST'
    elif pos in ['GK']:
        return 'GK'
    else:
        return 'Other'

# Create a new column for position groups
Vb['position_group'] = Vb['position'].apply(map_position_group)

#Exploring the best for our metric in a given position
Vc = Vb[Vb['position_group'].isin(['CB'])]
Vc.sort_values("vaep_padj", ascending=True).head(20)

#Visualizing actions that were assigned to a player in his convex hulls
from mplsoccer import Pitch, VerticalPitch, lines
import matplotlib.colors as mcolors
#Selecting player
filtered_events_df1 = inside_convex_result_df[inside_convex_result_df['convex_player_id'] == 100962]
#Creating the colormap
cmap = matplotlib.colormaps.get_cmap('bwr')
vmin = -0.006  # or set manually, e.g., vmin = 0
vmax = 0.019  # or set manually, e.g., vmax = 1
# Normalize the data to the specified range
norm = mcolors.Normalize(vmin=vmin, vmax=vmax)
# Apply the colormap and normalization to your data
colors = cmap(norm(filtered_events_df1['vaep_value']))
#Plotting the data
pitch = Pitch(pitch_type='custom', pitch_width=68, pitch_length=105, goal_type='box', linewidth=2, line_color='black', half=False)
bins = (6, 5)
fig, ax = pitch.draw(figsize=(16, 11), constrained_layout=True, tight_layout=False)
# plot the heatmap - darker colors = more passes originating from that square
cmap = mcolors.LinearSegmentedColormap.from_list("custom_red", ["#D7D1CF", "#FF0000"])
bs_heatmap = pitch.bin_statistic(filtered_events_df1.inverted_x_a0, filtered_events_df1.inverted_y_a0, statistic='count', bins=bins)
hm = pitch.heatmap(bs_heatmap, ax=ax, cmap=cmap, zorder = 3, alpha = 0.5)
scatter = pitch.scatter(filtered_events_df1.inverted_x_a0, filtered_events_df1.inverted_y_a0, ax=ax, edgecolor='#000000', facecolor=colors, s=50)

	game_id	original_event_id	action_id	period_id	time_seconds	team_id	player_id	x	y	dx	...	is_recovery	team_name	fotmob_id	fotmob_name	vaep_value	convex_player_name	convex_player_id	convex_team_id	inverted_x_a0	inverted_y_a0
0	1834864	2.711097e+09	9371444	1	1177.50	87	315369.0	12.075	37.400	0.000	...	NaN	Juventus	9885	Juventus	0.056515	Pepe Reina	2987	1290	12.075	37.400
1	1834864	2.711097e+09	9371445	1	1178.00	87	315369.0	12.915	37.264	-12.915	...	NaN	Juventus	9885	Juventus	-0.008968	Pepe Reina	2987	1290	12.915	37.264
2	1834864	2.711097e+09	9371445	1	1178.00	87	315369.0	12.915	37.264	-12.915	...	NaN	Juventus	9885	Juventus	-0.075889	Pepe Reina	2987	1290	12.915	37.264
3	1834864	2.711111e+09	9371959	1	2535.00	87	494525.0	14.595	35.700	-2.310	...	NaN	Juventus	9885	Juventus	-0.032546	Pepe Reina	2987	1290	14.595	35.700
4	1834864	2.711113e+09	9372046	1	2718.00	87	353991.0	14.910	34.748	0.000	...	NaN	Juventus	9885	Juventus	0.069313	Pepe Reina	2987	1290	14.910	34.748
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
498972	1835056	2.774332e+09	8836782	1	2294.00	77	149029.0	31.080	3.740	25.830	...	NaN	Lazio	8543	Lazio	-0.006645	Kevin Martins	540598	269	31.080	3.740
498973	1835033	2.741948e+09	9227027	2	221.00	278	128839.0	37.485	29.036	-5.565	...	ball recovery	Genoa	10233	Genoa	0.016769	Tommaso Rubino	544269	73	37.485	29.036
498974	1835033	NaN	9227627	2	1559.25	278	425115.0	40.950	25.908	24.465	...	NaN	Genoa	10233	Genoa	0.000974	Tommaso Rubino	544269	73	40.950	25.908
498975	1835033	2.741955e+09	9227683	2	1728.00	278	317544.0	35.805	27.268	0.000	...	NaN	Genoa	10233	Genoa	0.001061	Tommaso Rubino	544269	73	35.805	27.268
498976	1835033	2.741955e+09	9227684	2	1729.00	278	317544.0	35.805	27.268	-3.255	...	NaN	Genoa	10233	Genoa	-0.003097	Tommaso Rubino	544269	73	35.805	27.268

	convex_player_name	convex_player_id	convex_team_id	season_id	minutes_played	avg_goal_probability	vaep_padj	position	position_group
338	Sead Kolasinac	110260	300	2425	2452.450000	0.000955	0.477483	LCB	CB
296	Pawel Dawidowicz	121577	76	2425	1552.516667	0.001202	0.601100	RCB	CB
31	Andrea Carboni	390493	269	2425	1812.650000	0.001309	0.654730	LCB	CB
13	Alessandro Bastoni	329665	75	2425	2792.450000	0.001460	0.729774	LCB	CB
130	Gianluca Mancini	244804	84	2425	3381.483333	0.001535	0.767562	RCB	CB
283	Odilon Kossounou	384516	300	2425	1075.450000	0.001612	0.806060	RCB	CB
50	Berat Djimsiti	100962	300	2425	2828.000000	0.001686	0.842839	CB	CB
378	Yann Bisseck	349126	75	2425	1707.950000	0.001789	0.894334	RCB	CB
70	Daniele Ghilardi	486744	76	2425	1431.416667	0.001795	0.897595	LCB	CB
48	Benjamin Pavard	259648	75	2425	2230.200000	0.001829	0.914666	RCB	CB
131	Giorgio Altare	338476	85	2425	840.766667	0.001849	0.924484	RCB	CB
61	Christian Kabasele	68393	86	2425	1004.166667	0.001869	0.934338	RCB	CB
322	Saba Goglichidze	498982	272	2425	2161.750000	0.001936	0.967846	RCB	CB
226	Mario Hermoso	344156	84	2425	628.033333	0.001978	0.988878	LCB	CB
240	Mattia De Sciglio	107275	272	2425	859.216667	0.002022	1.010881	RCB	CB
15	Alessandro Buongiorno	333373	276	2425	1922.033333	0.002061	1.030597	LCB	CB
358	Thomas Kristensen	440029	86	2425	1328.050000	0.002064	1.032159	RCB	CB
55	Bremer	339875	87	2425	678.650000	0.002079	1.039396	LCB	CB
198	Lloyd Kelly	332419	87	2425	641.350000	0.002125	1.062487	LCB	CB
301	Pierre Kalulu	391836	87	2425	2587.600000	0.002137	1.068335	LCB	CB