In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import matplotlib.colors as mcolors
import matplotlib.patheffects as path_effects
from mplsoccer import Pitch, VerticalPitch, lines
from PIL import Image
import urllib
import socceraction
import socceraction.atomic.spadl as atomicspadl
In [2]:
# Load custom fonts for visualization
fe_regular = fm.FontEntry(
fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-Regular.ttf',
name='SourceSansPro-Regular'
)
semibold_font = fm.FontProperties(fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-SemiBold.ttf')
# Insert both fonts into the font manager
fm.fontManager.ttflist.insert(0, fe_regular)
# Set the font family to the custom regular font
matplotlib.rcParams['font.family'] = fe_regular.name
In [3]:
season = 2425
In [4]:
# Load datasets from CSV files
VAEP = pd.read_csv("aVAEPactions.csv", index_col = 0)
fb = pd.read_csv("teamsFOTMOB.csv", index_col = 0)
players = pd.read_csv(f"players{season}.csv", index_col = 0)
games = pd.read_csv(f"games{season}.csv", index_col = 0)
aactions = pd.read_csv(f"atomic_actions{season}.csv", index_col = 0)
chains = pd.read_csv(f"possession_chains_info{season}.csv", index_col = 0)
recoveries = pd.read_csv(f"recoveries_id{season}.csv", index_col = 0)
recoveries.rename(columns = {'event_id':'original_event_id'}, inplace = True)
In [5]:
# Add descriptive action names to the atomic actions DataFrame
aactions = atomicspadl.add_names(aactions)
In [6]:
# Merge datasets to create a unified DataFrame
dfa = (
aactions
.merge(players, how="left")
.merge(fb, how="left")
.merge(VAEP, how="left"))
In [7]:
#Adding on both frames a set of features that we need to define progressive actions
dfa["beginning_distance"] = np.sqrt(np.square(105-dfa['x_a0']) + np.square(34-dfa['y_a0'])).round(2)
dfa["end_distance"] = np.sqrt(np.square(105-dfa['end_x']) + np.square(34-dfa['end_y'])).round(2)
dfa["length"] = dfa["end_distance"] - dfa["beginning_distance"]
dfa['length'] = dfa['length'].abs()
dfa["angle"] = np.arctan2(dfa["end_y"] - dfa["y_a0"], dfa["end_x"] - dfa["x_a0"])
dfa['angle_degrees'] = np.degrees(dfa['angle']) % 360
In [8]:
#Adding features for next actions we'll use for calculations and plotting later on
dfa["next_type_name"] = dfa.shift(-1, fill_value=0)["type_name"]
dfa["next_team_name"] = dfa.shift(-1, fill_value=0)["team_name"]
dfa["next_starting_position"] = dfa.shift(-1, fill_value=0)["starting_position"]
dfa["next_player_name"] = dfa.shift(-1, fill_value=0)["player_name"]
dfa["next_original_event_id"] = dfa.shift(-1, fill_value=0)["original_event_id"]
dfa["next_start_x"] = dfa.shift(-1, fill_value=0)["x_a0"]
dfa["next_start_y"] = dfa.shift(-1, fill_value=0)["y_a0"]
dfa["next_end_x"] = dfa.shift(-1, fill_value=0)["end_x"]
dfa["next_end_y"] = dfa.shift(-1, fill_value=0)["end_y"]
dfa["next_vaep_value"] = dfa.shift(-1, fill_value=0)["vaep_value"]
In [9]:
#Flagging progressive actions given a custom defintion of passes not starting in the box, with lenght of more than 5m
#not backwards or horizontal and the action closes the distance from starting point to center of the goal by at least 17.5%
dfa['progressive'] = np.where(
((dfa['beginning_distance'] - dfa['end_distance']) / dfa['beginning_distance'] >= 0.175) & (dfa['length'] > 5) &
(((dfa['angle_degrees'] >= 0) & (dfa['angle_degrees'] <= 60)) | ((dfa['angle_degrees'] >= 260) & (dfa['angle_degrees'] <= 360))) &
~((dfa['x_a0'] >= 88.5) & (dfa['y_a0'] >= 13.885) & (dfa['y_a0'] <= 54.115)),
True, False)
In [10]:
# Function to format season ID into a readable format
def format_season_id(season_id):
# Convert to integer if it's a float
season_id = int(season_id)
# Extract the last two digits of the year
start_year = str(season_id -1)[-2:]
# Calculate the end year
end_year = str(season_id)[-2:]
# Format as 20/21
formatted_season = f"{start_year}/{end_year}"
return formatted_season
In [11]:
# Convert 'minutes_played' to total minutes with error handling
def convert_to_minutes(time_str):
try:
# Convert to string in case it's a float (e.g., NaN)
time_str = str(time_str)
# Split the time string into minutes and seconds
minutes, seconds = map(int, time_str.split(':'))
# Convert total time to minutes (seconds converted to fraction of minutes)
return minutes + seconds / 60
except (ValueError, AttributeError):
# Handle cases where the conversion fails (e.g., NaN or bad format)
return 0 # or use `np.nan` if you prefer to mark as missing
# Apply the conversion function to the 'minutes_played' column
players['minutes_played_converted'] = players['minutes_played'].apply(convert_to_minutes)
In [12]:
#We look at the duration of each game in the dataset
minutesadj = players.groupby(["game_id", "game_duration"], observed=True)['is_starter'].count().reset_index(name='is_starter')
# Apply the conversion function to the 'minutes_played' column
minutesadj['game_duration_converted'] = minutesadj['game_duration'].apply(convert_to_minutes)
#We find the median duration of games in the dataset to normalize for that instead of 90'
minutesadj = minutesadj.game_duration_converted.median()
minutesadj
Out[12]:
98.46666666666667
In [13]:
#Creating a table in which each player has his total of minutes played in the season and merge with team_name column
mp = players.groupby(["player_id", "player_name", "team_id"])["minutes_played_converted"].sum().reset_index(name='minutes_played')
mp = mp.merge(fb[['team_id', 'team_name']])
In [14]:
#Keeping only carries in the dataframe
df1a = dfa[dfa["type_name"] == 'dribble']
In [15]:
#Get the whole list of players in the data
playerlist = df1a['player_name'].unique().tolist()
cleaned_playerlist = [name for name in playerlist if pd.notna(name)]
cleaned_playerlist.sort()
In [16]:
from IPython.display import display, HTML
# Generate the HTML dropdown to easily search for players
options_html = ''.join([f'<option value="{name}">{name}</option>' for name in cleaned_playerlist])
dropdown_html = f"""
<input list="players" id="dropdown" oninput="handleInput()" placeholder="Choose Someone">
<datalist id="players">
{options_html}
</datalist>
<p id="output"></p>
<script>
function handleInput() {{
var input = document.getElementById("dropdown").value;
var output = document.getElementById("output");
output.innerHTML = "Selected: " + input;
}}
</script>
"""
# Display the dropdown
display(HTML(dropdown_html))
In [17]:
#Selecting player and filtering
player = 'Tino Anjorin'
df2a = df1a[df1a["player_name"] == player]
#Check the teams he played for in the season/dataset to filter down eventually
df2a.team_name.unique()
Out[17]:
array(['Empoli'], dtype=object)
In [18]:
# Apply the function to the 'season_id' column
df2a['formatted_season'] = df2a['season_id'].apply(format_season_id)
/var/folders/ns/3wxdg4g57h77vxwmr4wzmvt40000gn/T/ipykernel_62848/2406992386.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df2a['formatted_season'] = df2a['season_id'].apply(format_season_id)
In [19]:
#Of course we keep this lines if we want to keep action for specific teams
df3a = df2a[df2a["team_name"] == "Empoli"]
In [20]:
#Selecting only progressive passes
df4a = df3a[df3a["progressive"] == True]
In [21]:
# Group once and calculate all metrics in a single operation
metrics = (df4a.groupby(
["player_id", "player_name", "team_name", "season_id"],
observed=True
).agg(
aVAEP=("vaep_value", "sum"),
aVAEP_next=("next_vaep_value", "sum")
).reset_index())
#We merge the amount of minutes played
metrics = metrics.merge(mp)
#Creating normalized metrics now that we have minutes played
metrics["aVAEP"] = metrics.aVAEP * minutesadj / metrics.minutes_played
metrics["aVAEP_next"] = metrics.aVAEP_next * minutesadj / metrics.minutes_played
In [22]:
# setting up the pitch, bins and figure
pitch = Pitch(pitch_type='custom', pitch_width=68, pitch_length=105, goal_type='box', linewidth=2, line_color='black', half=False)
bins = (9, 6)
fig, ax = pitch.draw(figsize=(16, 11), constrained_layout=True, tight_layout=False)
# plot the heatmap - more intense red = more carries originating from that square
cmap = mcolors.LinearSegmentedColormap.from_list("custom_red", ["#D7D1CF", "#FF0000"])
bs_heatmap = pitch.bin_statistic(df4a.x_a0, df4a.y_a0, statistic='count', bins=bins)
hm = pitch.heatmap(bs_heatmap, ax=ax, cmap=cmap, zorder = 3, alpha = 0.8)
# plot the underliying carries
am = pitch.arrows(df4a.x_a0, df4a.y_a0, df4a.end_x, df4a.end_y, width=0.8, alpha = 0.5, zorder = 1,
headwidth = 10, headlength = 8, color = '#000000', label = 'successful passes', ax=ax)
# Calculate sum value of vaep value for each bin
bs_vaep = pitch.bin_statistic(df4a.x_a0, df4a.y_a0, values=df4a.vaep_value, statistic='sum', bins=bins)
# Add text annotations for each bin with the sum vaep value
for i in range(bs_vaep['statistic'].shape[0]):
for j in range(bs_vaep['statistic'].shape[1]):
# Get the bin center coordinates
bin_center_x, bin_center_y = bs_vaep['cx'][i, j], bs_vaep['cy'][i, j]
# Get the sum value of vaep_net for the bin
sum_vaep = bs_vaep['statistic'][i, j]
# Add annotation if sum value is not close to zero
if (sum_vaep < -0.01) | (sum_vaep > 0.01):
text = ax.text(bin_center_x, bin_center_y, f"{sum_vaep:.3f}", color="#FFFFFF", ha="center", va="center",
fontsize=20, zorder=4)
# Add path effects for the edge
text.set_path_effects([path_effects.Stroke(linewidth=4, foreground='#000000'), path_effects.Normal()])
#Adding notes and titles
ax.text(0.5, 1.06, f"{player}'s progressive carries for {df3a.team_name.unique()[0]}", fontsize=25, va='center', ha='center', transform=ax.transAxes, fontproperties=semibold_font)
ax.text(0.5, 1.01, f"Carries : {(df4a['type_name'].count())} | Carries aVAEP per 98 : {metrics['aVAEP'].unique()[0].round(3)} | aVAEP following action per 98 : {metrics['aVAEP_next'].unique()[0].round(3)} | {', '.join(df3a['competition_id'].unique())} {', '.join(df3a['formatted_season'].unique())}\nHeatmap: amount of carries from that zone | Annotations: aVAEP from zone",
fontsize=12, va='center', ha='center', transform=ax.transAxes)
ax.text(0.5, 0.02, f'X: @gualanodavide | Bluesky: @gualanodavide.bsky.social | Linkedin: www.linkedin.com/in/davide-gualano-a2454b187 | Newsletter: the-cutback.beehiiv.com',
fontsize=10, va='center', ha='center', transform=ax.transAxes)
ax.annotate(text="", xy=(65, 69.5), xytext=(-200, 0), textcoords="offset points", size=27, color="#000000",
arrowprops=dict(arrowstyle="-|>", shrinkA=0, color="black", linewidth=2))
#Adding team logo
fotmob_url = "https://images.fotmob.com/image_resources/logo/teamlogo/"
logo_ax = fig.add_axes([.135, 0.85, 0.09, 0.09], zorder=1)
club_icon = Image.open(urllib.request.urlopen(f"{fotmob_url}{df4a['fotmob_id'].iloc[0]}.png"))
logo_ax.imshow(club_icon)
logo_ax.axis("off")
# Save the figure with adjusted face color and transparency
plt.savefig(f'{player}-progressive-{season}.png', dpi=500, facecolor="#D7D1CF", bbox_inches="tight", transparent=True)
In [23]:
# Group once and calculate all metrics in a single operation
metrics = (df3a.groupby(
["player_id", "player_name", "team_name", "season_id"],
observed=True
).agg(
aVAEP=("vaep_value", "sum"),
aVAEP_next=("next_vaep_value", "sum")
).reset_index())
#We merge the amount of minutes played
metrics = metrics.merge(mp)
#Creating normalized metrics now that we have minutes played
metrics["aVAEP"] = metrics.aVAEP * minutesadj / metrics.minutes_played
metrics["aVAEP_next"] = metrics.aVAEP_next * minutesadj / metrics.minutes_played
In [24]:
# setting up the pitch, bins and figure
pitch = Pitch(pitch_type='custom', pitch_width=68, pitch_length=105, goal_type='box', linewidth=2, line_color='black', half=False)
bins = (9, 6)
fig, ax = pitch.draw(figsize=(16, 11), constrained_layout=True, tight_layout=False)
# plot the heatmap - more intense red = more carries originating from that square
cmap = mcolors.LinearSegmentedColormap.from_list("custom_red", ["#D7D1CF", "#FF0000"])
bs_heatmap = pitch.bin_statistic(df3a.x_a0, df3a.y_a0, statistic='count', bins=bins)
hm = pitch.heatmap(bs_heatmap, ax=ax, cmap=cmap, zorder = 3, alpha = 0.8)
# plot the underliying carries
am = pitch.arrows(df3a.x_a0, df3a.y_a0, df3a.end_x, df3a.end_y, width=0.8, alpha = 0.5, zorder = 1,
headwidth = 10, headlength = 8, color = '#000000', label = 'successful passes', ax=ax)
# Calculate sum value of vaep value for each bin
bs_vaep = pitch.bin_statistic(df3a.x_a0, df3a.y_a0, values=df3a.vaep_value, statistic='sum', bins=bins)
# Add text annotations for each bin with the sum vaep value
for i in range(bs_vaep['statistic'].shape[0]):
for j in range(bs_vaep['statistic'].shape[1]):
# Get the bin center coordinates
bin_center_x, bin_center_y = bs_vaep['cx'][i, j], bs_vaep['cy'][i, j]
# Get the sum value of vaep_net for the bin
sum_vaep = bs_vaep['statistic'][i, j]
# Add annotation if sum value is not close to zero
if (sum_vaep < -0.01) | (sum_vaep > 0.01):
text = ax.text(bin_center_x, bin_center_y, f"{sum_vaep:.3f}", color="#FFFFFF", ha="center", va="center",
fontsize=20, zorder=4)
# Add path effects for the edge
text.set_path_effects([path_effects.Stroke(linewidth=4, foreground='#000000'), path_effects.Normal()])
#Adding notes and titles
ax.text(0.5, 1.06, f"{player}'s carries for {df3a.team_name.unique()[0]}", fontsize=25, va='center', ha='center', transform=ax.transAxes, fontproperties=semibold_font)
ax.text(0.5, 1.01, f"Carries : {(df3a['type_name'].count())} | Carries aVAEP per 98 : {metrics['aVAEP'].unique()[0].round(3)} | aVAEP following action per 98 : {metrics['aVAEP_next'].unique()[0].round(3)} | {', '.join(df3a['competition_id'].unique())} {', '.join(df3a['formatted_season'].unique())}\nHeatmap: amount of carries from that zone | Annotations: aVAEP from zone",
fontsize=12, va='center', ha='center', transform=ax.transAxes)
ax.text(0.5, 0.02, f'X: @gualanodavide | Bluesky: @gualanodavide.bsky.social | Linkedin: www.linkedin.com/in/davide-gualano-a2454b187 | Newsletter: the-cutback.beehiiv.com',
fontsize=10, va='center', ha='center', transform=ax.transAxes)
ax.annotate(text="", xy=(65, 69.5), xytext=(-200, 0), textcoords="offset points", size=27, color="#000000",
arrowprops=dict(arrowstyle="-|>", shrinkA=0, color="black", linewidth=2))
#Adding team logo
fotmob_url = "https://images.fotmob.com/image_resources/logo/teamlogo/"
logo_ax = fig.add_axes([.135, 0.85, 0.09, 0.09], zorder=1)
club_icon = Image.open(urllib.request.urlopen(f"{fotmob_url}{df4a['fotmob_id'].iloc[0]}.png"))
logo_ax.imshow(club_icon)
logo_ax.axis("off")
# Save the figure with adjusted face color and transparency
plt.savefig(f'{player}-overall-{season}.png', dpi=500, facecolor="#D7D1CF", bbox_inches="tight", transparent=True)
In [ ]:
In [ ]:
In [ ]: