In [1]:
#Loading all the packages that we need
import urllib
import urllib.request
import warnings

import matplotlib
import matplotlib.colors as mcolors
import matplotlib.font_manager as fm
import matplotlib.patheffects as path_effects
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.patches import RegularPolygon
from mplsoccer import Pitch, VerticalPitch, lines
from PIL import Image
from scipy.ndimage import gaussian_filter

import socceraction
import socceraction.atomic.spadl as atomicspadl
In [2]:
#Importing the fonts
# Each FontEntry maps a TTF file on disk to the family name used by the plots below.
# NOTE(review): the paths are absolute and machine-specific; the notebook will not
# find the fonts on another machine unless these are adjusted.
fe_regular = fm.FontEntry(
    name='SourceSansPro-Regular',
    fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-Regular.ttf',
)
fe_semibold = fm.FontEntry(
    name='SourceSansPro-SemiBold',
    fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-SemiBold.ttf',
)
fe_medium = fm.FontEntry(
    name='Shentox-Medium',
    fname='/Users/davidegualano/Documents/Python FTBLData/Shentox-W01-Medium.ttf',
)
fe_bold = fm.FontEntry(
    name='Shentox-Bold',
    fname='/Users/davidegualano/Documents/Python FTBLData/Shentox-W01-Bold.ttf',
)

# Insert the four fonts at the front of the font manager's list, in this order
for _slot, _entry in enumerate((fe_regular, fe_semibold, fe_medium, fe_bold)):
    fm.fontManager.ttflist.insert(_slot, _entry)

# The regular weight is the default family for all text
matplotlib.rcParams['font.family'] = fe_regular.name
In [3]:
#Choosing the season for which we want to look at the data
# The value is interpolated into the per-season CSV file names and compared with
# positions['season_id'] further down (2425 is later rendered as "24/25").
season = 2425
In [4]:
#Loading the data containing files
# Every CSV is read relative to the current working directory and its first column
# is used as the index. Three of the files are selected through `season`.
fb = pd.read_csv("teamsFOTMOB.csv", index_col = 0)  # team lookup (team_id, team_name, fotmob_id are used below)
players = pd.read_csv(f"players{season}.csv", index_col = 0)  # per-game player rows (minutes_played, game_duration, is_starter, ...)
games = pd.read_csv(f"games{season}.csv", index_col = 0)
actions = pd.read_csv(f"atomic_actions{season}.csv", index_col = 0)
positions = pd.read_csv("clustered_position.csv", index_col = 0)  # position / position_group per player and season_id
VAEP = pd.read_csv("aVAEPactions.csv", index_col = 0)  # presumably the source of `vaep_value` used later - TODO confirm
In [5]:
#Filtering players and games files so as to keep only the important features
# .copy() makes `games` an independent frame, so overwriting `game_date` below does
# not write into a slice of the originally loaded frame (SettingWithCopyWarning).
games = games[["game_id", "game_date", "competition_id", "season_id"]].copy()
games['game_date'] = pd.to_datetime(games['game_date'])
# Identification columns only; the full `players` frame is still used later for minutes played
players_info = players[['game_id', 'team_id', 'player_id', 'player_name', 'season_id', 'competition_id']]
In [6]:
#Using the spadl framework to add names to the actions
# presumably this is what provides the `type_name` column filtered on later
# (type_name == 'take_on') - TODO confirm against the socceraction docs
actions = atomicspadl.add_names(actions)
In [7]:
#Merging all the files to the actions so as to have all the relevant information
# Left-join each lookup table in turn. No `on=` is given, so pandas joins on every
# column name the two frames have in common at that point.
df = actions
for _lookup in (VAEP, fb, games, players_info):
    df = df.merge(_lookup, how="left")
In [8]:
#Creating other features we want from scratch
# Displacement between (x_a0, y_a0) and (end_x, end_y) for every row
delta_x = df["end_x"] - df["x_a0"]
delta_y = df["end_y"] - df["y_a0"]

# Direction of the displacement, in radians and wrapped into [0, 360) degrees
df["angle"] = np.arctan2(delta_y, delta_x)
df['angle_degrees'] = np.degrees(df['angle']) % 360

# Euclidean length of the displacement, rounded to 2 decimals
df["action_distance"] = np.sqrt(delta_x**2 + delta_y**2).round(2)

# Time until the next row's timestamp.
# NOTE(review): the shift is over the whole frame, so the last row of one game is
# compared with the first row of the next one - confirm this is acceptable.
df['duration'] = df['time_seconds'].shift(-1) - df['time_seconds']

# Distance per second where it can be computed; missing and infinite values become 0
raw_speed = df["action_distance"] / df["duration"]
df["instantaneous_speed_euclid"] = raw_speed.fillna(0).replace([float('inf'), float('-inf')], 0)
In [9]:
# Creating a function to keep only those actions that are more or less forward
def is_forward_angle(angle_deg, forward_range=45):
    """
    Determine if an angle is considered "forward" (left to right).

    Forward means within ``forward_range`` degrees of 0 degrees, on either side.
    The angle is first wrapped into [0, 360), so inputs such as -10 or 370 are
    handled like 350 and 10. Works on a single number as well as element-wise on
    a NumPy array or pandas Series. NaN is never forward.

    Parameters:
    - angle_deg: Angle in degrees (scalar, array or Series)
    - forward_range: How many degrees on either side of straight forward to consider forward

    Returns:
    - bool for a scalar input, boolean array/Series for array-like input
    """
    # Wrap so that out-of-range angles are compared on the same 0-360 scale
    wrapped = angle_deg % 360
    # `|` instead of `or` keeps the expression valid for arrays and Series
    return (wrapped >= 360 - forward_range) | (wrapped <= forward_range)

# Half-width, in degrees, of the window around 0 degrees that counts as "forward"
forward_angle_range = 60

# Boolean column used for filtering: True where the action's direction is forward
df['is_forward'] = df['angle_degrees'].apply(is_forward_angle, forward_range=forward_angle_range)
In [10]:
#Adding features from next/previous actions we'll use later
# shift(-1) brings the following row's value onto the current row, shift(+1) the
# preceding row's value. The row that has no neighbour receives `fill_value`.
# NOTE(review): the shifts run over the whole frame, so rows at the start/end of a
# game pick up values from the adjacent game - confirm this is acceptable.
df["next_team_name"] = df["team_name"].shift(-1, fill_value=0)

df["next_player_id"] = df["player_id"].shift(-1, fill_value=0)
df["prev_player_id"] = df["player_id"].shift(+1, fill_value=0)

df["next_type_name"] = df["type_name"].shift(-1, fill_value=0)
df["prev_type_name"] = df["type_name"].shift(+1, fill_value=0)

df["prev_instantaneous_speed_euclid"] = df["instantaneous_speed_euclid"].shift(+1, fill_value=0)
df["prev_action_distance"] = df["action_distance"].shift(+1, fill_value=0)

df["next_vaep_value"] = df["vaep_value"].shift(-1, fill_value=0)
# Fill with False (not 0) so the column stays boolean and can be used directly as a
# mask; an integer fill value can upcast a boolean column to object dtype.
df["next_is_forward"] = df["is_forward"].shift(-1, fill_value=False)
In [11]:
#Filtering the dataframe to keep take ons that start from a standstill:
#  1. the action is a take on
#  2. the previous row's action_distance is below 4
#  3. the previous row's instantaneous speed is below 1.5
takeon0 = df.loc[df['type_name'].eq('take_on')]
takeon1 = takeon0.loc[takeon0['prev_action_distance'].lt(4)]
takeon = takeon1.loc[takeon1['prev_instantaneous_speed_euclid'].lt(1.5)]
In [12]:
#Defining what's successful: the same player needs to perform the next action
#and that next action has to be a forward one
succtakeon0 = takeon[takeon['player_id'] == takeon['next_player_id']]
# Cast to bool explicitly: `next_is_forward` comes from a shift with a fill value and
# may not have a boolean dtype, in which case it is not a reliable boolean mask.
succtakeon = succtakeon0[succtakeon0['next_is_forward'].astype(bool)]
In [13]:
#Creating dataframes with the number of standstill take ons and of successful ones per player
total_takeons = (
    takeon
    .groupby(["player_id", "player_name"], observed=True)['type_name']
    .count()
    .reset_index(name='take_ons')
)
succ_takeons = (
    succtakeon
    .groupby(["player_id", "player_name"], observed=True)['type_name']
    .count()
    .reset_index(name='succ_take_ons')
)
In [14]:
#Creating dataframes with the summed value of the action following each take on, per player
# Note that the "total" value is summed over succtakeon0 (take ons followed by an
# action of the same player), not over every take on in `takeon`.
total_takeons_v = (
    succtakeon0
    .groupby(["player_id", "player_name"], observed=True)['next_vaep_value']
    .sum()
    .reset_index(name='take_ons_value')
)
succ_takeons_v = (
    succtakeon
    .groupby(["player_id", "player_name"], observed=True)['next_vaep_value']
    .sum()
    .reset_index(name='succ_take_ons_value')
)
In [15]:
#Merging together the dataframes
# Start from the per-player totals and left-join the other three tables; players
# missing from a table get 0 for its metric.
A = total_takeons
for _metric_table in (succ_takeons, total_takeons_v, succ_takeons_v):
    A = A.merge(_metric_table, how='left')
A = A.fillna(0)

#Creating a percentage-of-successful column
success_share = A['succ_take_ons'] / A['take_ons']
A['pct'] = (success_share * 100).round(2)
In [16]:
# Convert an "MM:SS" value to total minutes, with error handling
def convert_to_minutes(time_str):
    """
    Turn a "minutes:seconds" value into a number of minutes.

    The value is converted to a string first, so non-string input (e.g. NaN) is
    accepted. Anything that is not exactly two integer fields separated by ':'
    yields 0.
    """
    try:
        # Exactly two integer fields are required; any other shape raises ValueError
        whole_minutes, extra_seconds = (int(field) for field in str(time_str).split(':'))
    except (ValueError, AttributeError):
        # Unparseable input (NaN, wrong format); use `np.nan` here to mark it as missing instead
        return 0
    else:
        # Seconds are expressed as a fraction of a minute
        return whole_minutes + extra_seconds / 60

# Convert every 'minutes_played' entry to a numeric number of minutes
players['minutes_played_converted'] = players['minutes_played'].map(convert_to_minutes)
In [17]:
#We look at the duration of each game in the dataset
# One row per (game_id, game_duration) pair; the count itself is not used afterwards
game_lengths = (
    players
    .groupby(["game_id", "game_duration"], observed=True)['is_starter']
    .count()
    .reset_index(name='is_starter')
)

# Convert the 'game_duration' column to numeric minutes
game_lengths['game_duration_converted'] = game_lengths['game_duration'].apply(convert_to_minutes)

#We use the median duration of games in the dataset to normalize, instead of 90'
minutesadj = game_lengths['game_duration_converted'].median()
minutesadj
Out[17]:
98.48333333333333
In [18]:
#Creating a table with each player's total minutes played in the season, plus the team_name column
season_minutes = players.groupby(["player_id", "player_name", "team_id"])["minutes_played_converted"].sum()
mp = season_minutes.reset_index(name='minutes_played')
# Attach the team name (join on the columns shared with the team lookup)
mp = mp.merge(fb[['team_id', 'team_name']])
In [19]:
#Filtering the position dataframe to keep only the position mapping of the selected season,
#so that players do not appear more than once
positions0 = positions.loc[positions['season_id'].eq(int(season))]

#Merging together the metrics, position and minutes played dataframes
B = A.merge(positions0)
B = B.merge(mp)
In [20]:
#Creating normalized metrics now that we have minutes played
# Each total is rescaled to a rate per median game length: total * minutesadj / minutes_played.
# The "_98" suffix reflects the median observed for this dataset; the actual factor is `minutesadj`.
B["take_ons_98"] = B.take_ons * minutesadj / B.minutes_played
B["succ_take_ons_98"] = B.succ_take_ons * minutesadj / B.minutes_played

B["take_ons_value_98"] = (B.take_ons_value * minutesadj / B.minutes_played).round(4)
B["succ_take_ons_value_98"] = (B.succ_take_ons_value * minutesadj / B.minutes_played).round(4)

#Keep only players with at least 1000 minutes played
# minutes_played is fractional, so `> 999` would also admit e.g. 999.5 minutes
B_final = B[B['minutes_played'] >= 1000]
In [21]:
#Keeping only the columns we need and looking at the unique position groups inside
kept_columns = [
    'player_id', 'player_name', 'team_name', 'season_id', 'position', 'position_group',
    'minutes_played', 'take_ons_98', 'succ_take_ons_98', 'take_ons_value_98',
    'succ_take_ons_value_98', 'pct',
]
X = B_final[kept_columns]

X['position_group'].unique()
Out[21]:
array(['GK', 'CDM', 'AMW', 'CB', 'WB', 'ST'], dtype=object)
In [22]:
#Selecting one position group, as we don't want to compare apples with pears
Z = X.loc[X["position_group"].eq('AMW')]
In [23]:
#Selecting a metric to explore the results for
# Must be a column of Z; the table figure has titles for take_ons_98,
# succ_take_ons_98, take_ons_value_98 and succ_take_ons_value_98.
metric = 'succ_take_ons_98'
In [24]:
#Creating the dataframe for the visualization: the ten highest values of the chosen metric
table_columns = ['player_id', 'player_name', 'team_name', 'season_id', 'position',
                 'position_group', 'minutes_played', metric, 'pct']
ranked = Z[table_columns].sort_values(by=[metric], ascending=False)
W = ranked.reset_index(drop=True).head(10)

#Rounding the selected metric to make it pleasing to the eye
W[metric] = W[metric].round(4)

#Sorting the top 10 in ascending order, so the best player ends up in the top row of the plotted table
Y = W.sort_values(by = [metric], ascending = True)

Y
Out[24]:
player_id player_name team_name season_id position position_group minutes_played succ_take_ons_98 pct
9 469507.0 Arthur Atta Udinese 2425.0 AM AMW 1310.133333 1.2027 50.00
8 421935.0 Kelly N'Mai Salford City 2425.0 AWL AMW 2774.633333 1.2068 39.53
7 406676.0 Óscar Aranda Famalicao 2425.0 AML AMW 2755.400000 1.2152 34.34
6 117980.0 Bruma Braga 2425.0 AWL AMW 1850.133333 1.2775 41.38
5 455842.0 Yoann Cathline FC Utrecht 2425.0 AWL AMW 2048.200000 1.3463 41.18
4 399311.0 Filip Krastev PEC Zwolle 2425.0 AML AMW 2353.366667 1.3810 41.25
3 339874.0 Marquinhos Spartak Moscow 2425.0 AWL AMW 1910.216667 1.3920 46.55
2 370984.0 Khvicha Kvaratskhelia Napoli 2425.0 AWL AMW 1256.150000 1.4112 24.66
1 519755.0 Ibrahim Osman Feyenoord 2425.0 AWR AMW 1247.166667 1.4214 47.37
0 400237.0 Robert Navarro Mallorca 2425.0 AWR AMW 1355.766667 1.5254 53.85
In [25]:
#Setting the figure, the axes and the dimension of the figure to make it all fit pleasingly
fig = plt.figure(figsize=(1800/500, 1800/500), dpi=500)
ax = plt.subplot()

ncols = Y.shape[1]
nrows = Y.shape[0]

ax.set_xlim(0, ncols + 1)
ax.set_ylim(0, nrows + 1)

# x position (data coordinates) of each displayed column, in the same order as `columns`
position = [0.1, 5, 6.5, 8.5]
columns = ['player_name', 'team_name', 'pct', metric]

#Writing the table cell by cell, with styling that depends on the column
for i in range(nrows):
    for j, column in enumerate(columns):
        # The first column is left aligned, every other column is centered
        if j == 0:
            ha = 'left'
        else:
            ha = 'center'
        if column == metric:
            # Highlighted metric: large white text on the colored band
            fontsize = 10
            color = '#FFFFFF'
            fontname = fe_semibold.name
        elif column == 'team_name':
            fontsize = 4  
            color = '#4E616C' 
            fontname = fe_regular.name
        else:
            fontsize = 8
            color = '#000000' 
            fontname = fe_semibold.name
        ax.annotate(
            xy=(position[j], i + .5), text=str(Y[column].iloc[i]), ha=ha, va='center', fontsize=fontsize, color=color, fontname=fontname)

# Add dividing lines: thick borders above and below the table, thin lines between rows
x_min, x_max = ax.get_xlim()
ax.plot([x_min, x_max], [nrows, nrows], lw=1.5, color='black', marker='', zorder=4)
ax.plot([x_min, x_max], [0, 0], lw=1.5, color='black', marker='', zorder=4)
for x in range(1, nrows):
    ax.plot([x_min, x_max], [x, x], lw=0.5, color='gray', ls='-', zorder=3 , marker='')

# Color band behind the highlighted metric column, drawn exactly once.
# It used to be drawn inside the loop above, which stacked nrows-1 semi-transparent
# layers (near-opaque for 10 rows) and drew nothing for a single-row table.
# Raise `alpha` if the darker look of the stacked version is preferred.
ax.fill_between(x=[7.5, 9.5], y1=nrows, y2=0, color='#D32F2F', alpha=0.5, ec='None')

# Adding titles and notes; the main title depends on the metric we use
titles_by_metric = {
    'take_ons_98': 'Number of Take Ons from a standstill',
    'succ_take_ons_98': 'Number of successful Take Ons from a standstill',
    'take_ons_value_98': 'Value of Take Ons from a standstill',
    'succ_take_ons_value_98': 'Value of successful Take Ons from a standstill',
}
# No title is drawn for a metric that is not listed above
if metric in titles_by_metric:
    plt.text(0.5, 0.91, titles_by_metric[metric], transform=fig.transFigure,
             horizontalalignment='center', fontsize=9, fontfamily='SourceSansPro-SemiBold')

plt.text(0.5, 0.86, f"Take-ons from a standstill: Must be performed within 4 meters of the previous action's end point,\nand the preceding action must have been executed at less than 1.5 meters per second", transform=fig.transFigure,
         horizontalalignment='center', fontsize=5)

# NOTE(review): the position group and the "98 minutes" in this note are hard-coded;
# update the text when another position group or dataset is used.
plt.text(0.5, 0.83, f'Attacking Midfielders and Wingers | Minimum 1000 minutes played | Percentage of success | Normalized per 98 minutes',
         transform=fig.transFigure, horizontalalignment='center', fontsize = 4, color = '#4E616C')
fig.suptitle(f'X: @gualanodavide | Bluesky: @gualanodavide.bsky.social | Linkedin: www.linkedin.com/in/davide-gualano-a2454b187 | Newsletter: the-cutback.beehiiv.com',
             horizontalalignment='center', x = 0.5125, y = 0.09, fontsize=3, color = "#000000")

#Saving and showing
ax.set_axis_off()
plt.savefig(f'TOP_DA_FERMO.png', dpi=500, facecolor = "#D7D1CF", bbox_inches = "tight", transparent = True)
plt.show()
No description has been provided for this image
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [26]:
#Selecting the player whose standstill take ons we want to visualize
player_df0 = takeon.loc[takeon['player_name'].eq('Lamine Yamal')]
# Teams the player appears with in the data (more than one if he changed team)
player_df0['team_name'].unique()
Out[26]:
array(['Barcelona'], dtype=object)
In [27]:
#Keeping only the rows for one team of the player - useful if the player changed team
player_df = player_df0.loc[player_df0['team_name'].eq('Barcelona')]
In [28]:
# Function to format a season ID into a readable format
def format_season_id(season_id):
    """
    Format a numeric season id as "<start>/<end>", e.g. 2425 -> "24/25".

    The id is truncated to an integer first (so 2425.0 is accepted). The end year
    is the last two digits of the id, the start year the last two digits of the
    id minus one.
    """
    end_value = int(season_id)
    last_two_of_previous = str(end_value - 1)[-2:]
    last_two_of_current = str(end_value)[-2:]
    return f"{last_two_of_previous}/{last_two_of_current}"

#Apply the function
# .assign returns a new frame with the extra column, so nothing is written into
# `player_df`, which is a filtered slice of another frame (SettingWithCopyWarning).
player_df = player_df.assign(formatted_season=player_df['season_id'].apply(format_season_id))
In [29]:
#Initializing the pitch, the figure and the axes in which to plot it
pitch = Pitch(pitch_type='custom', pitch_width=68, pitch_length=105, goal_type='box', linewidth=1.25, line_color='#000000', 
              line_zorder=2, pitch_color='#D7D1CF')

fig, axs = pitch.grid(endnote_height=0.03, endnote_space=0, grid_width=0.88, left=0.025, title_height=0.06, title_space=0, axis=False,
                      grid_height=0.86)

fig.set_facecolor('#D7D1CF')

# plot the heatmap - redder squares = more take ons starting from that square
bins = (18, 12)
cmap1 = mcolors.LinearSegmentedColormap.from_list("custom_red", ["#D7D1CF", "#FF0000"])

bs_heatmap1 = pitch.bin_statistic(player_df.x_a0, player_df.y_a0, statistic='count', bins=bins)
hm = pitch.heatmap(bs_heatmap1, ax=axs['pitch'], cmap=cmap1, zorder = 2, alpha = 0.8)

# Scatter plot: one dot per take on, at its (x_a0, y_a0) location
pitch.scatter(player_df['x_a0'], player_df['y_a0'], ax=axs['pitch'], edgecolor='#000000', facecolor='#000000', s=100, alpha=1)

# Variables that store elements to use in titles (taken from the first row of player_df)
team_name = player_df['team_name'].iloc[0]
player_name = player_df['player_name'].iloc[0]
# Drop missing ids and cast to str so the join works whatever the column dtype is
competition_ids = ', '.join(player_df['competition_id'].dropna().astype(str).unique())
formatted_season = player_df['formatted_season'].iloc[0]
season_id = player_df['season_id'].iloc[0]

# Titles
fig.text(0.15, 0.99, f'{player_name} Take-ons from a standstill',
         fontsize=30, va='center', ha='left', fontfamily='SourceSansPro-SemiBold')
fig.text(0.15, 0.95, f'{competition_ids} {formatted_season}  |  Take Ons: {len(player_df)}',
         fontsize=20, va='center', ha='left')
fig.text(0.05, 0.05, 'X: @gualanodavide | Bluesky: @gualanodavide.bsky.social | Linkedin: www.linkedin.com/in/davide-gualano-a2454b187 | Newsletter: the-cutback.beehiiv.com', va='center', ha='left', fontsize=12)

# Text-less annotation: only its arrow is drawn, in the title axes, ending at xy and
# starting 200 points to the left of it.
title5 = axs['title'].annotate(xy=(.635, 0), xytext=(-200, 0), textcoords="offset points", text="", 
                               size=10, color="#FFFFFF", arrowprops=dict(arrowstyle="-|>", shrinkA=0, color="#000000", linewidth=1.5))

#Adding the club logo
# Plain assignment: the former `DC_to_FC = ax=...` also rebound the global `ax`
# to this transform method.
DC_to_FC = axs['pitch'].transData.transform
FC_to_NFC = fig.transFigure.inverted().transform
# -- Take data coordinates and transform them to normalized figure coordinates
DC_to_NFC = lambda x: FC_to_NFC(DC_to_FC(x))

ax_size = 0.12
y = 62
# Display coordinates of the chosen data point on the pitch axes
data_coords = DC_to_FC((-4, (y * 1.18) - 2.25))
ax_coords = FC_to_NFC(data_coords)  # Transform to figure coordinates

# Keep the computed vertical position but pin the horizontal one
adjusted_x = 0.04
ax_coords = (adjusted_x, ax_coords[1])

# Retrieve the FotMob id of the team from the team lookup (first match by team_name)
team_id = fb[fb['team_name'] == team_name]['fotmob_id'].iloc[0]

# Add an axis for the image
image_ax = fig.add_axes([ax_coords[0], ax_coords[1], ax_size, ax_size], fc='None', anchor='C')
fotmob_url = 'https://images.fotmob.com/image_resources/logo/teamlogo/'

# Best effort: if the logo cannot be downloaded or decoded, the figure is still
# produced without it and the reason is printed.
try:
    # The context manager closes the HTTP response; convert() loads the pixels before that
    with urllib.request.urlopen(f"{fotmob_url}{team_id}.png") as response:
        player_face = Image.open(response).convert('RGBA')  # holds the team logo
    image_ax.imshow(player_face)
except Exception as e:
    print(f"Error loading image for team {team_name}: {e}")

image_ax.axis("off")

# Save the figure
# NOTE(review): season_id is taken straight from the frame, so a float id ends up in
# the file name as e.g. "2425.0".
plt.savefig(f'{player_name}-standstilltakeons-{season_id}.png', dpi=500, facecolor="#D7D1CF", bbox_inches="tight", transparent=True)
plt.show()
No description has been provided for this image
In [ ]:
 
In [ ]:
 
In [ ]: