In [1]:
# Import necessary libraries for data manipulation, visualization, and analysis
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from PIL import Image
import urllib.request
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.dates as mdates
import matplotlib.font_manager as fm
from tqdm import tqdm
import socceraction
import socceraction.spadl as spadl
In [2]:
# Register two Source Sans Pro font files with matplotlib under custom family names
# NOTE(review): the paths are absolute and machine-specific; the notebook only runs where these files exist
font_files = (
    ('SourceSansPro-Regular', '/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-Regular.ttf'),
    ('SourceSansPro-SemiBold', '/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-SemiBold.ttf'),
)
fe_regular, fe_semibold = (
    fm.FontEntry(fname=font_path, name=font_name)
    for font_name, font_path in font_files
)
# Place both entries at the very front of the font manager's list (regular first, semibold second)
fm.fontManager.ttflist[0:0] = [fe_regular, fe_semibold]
In [3]:
# Season code, set once here; it is interpolated into the games/actions/players CSV filenames loaded below
season = '2425'
In [4]:
# Season-specific tables: the filename is "<stem><season>.csv", first CSV column becomes the index
games, actions, players = (
    pd.read_csv(f"{stem}{season}.csv", index_col=0)
    for stem in ("games", "actions", "players")
)
# Season-independent tables, read the same way
fb, xG, xP = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ("teamsFOTMOB.csv", "xGactions.csv", "xPactions.csv")
)
In [5]:
# Trim the games table to the columns used later and parse the match date into datetimes
game_columns = ["game_id", "game_date", "competition_id", "season_id"]
games = games[game_columns].assign(
    game_date=lambda frame: pd.to_datetime(frame['game_date'])
)
# Per-game player lookup (ids, name, season and competition) merged onto the events later
player_columns = ['game_id', 'team_id', 'player_id', 'player_name', 'season_id', 'competition_id']
players_info = players[player_columns]
In [6]:
# Run the events through socceraction's SPADL add_names helper
# (presumably this is what supplies the type_name column counted further down - TODO confirm)
actions = actions.pipe(spadl.add_names)
In [7]:
# Left-join each lookup table onto the events in turn, keeping every event row.
# No join keys are given, so pandas joins on the column names the two frames have in common.
df = actions
for lookup in (fb, xG, xP, games, players_info):
    df = df.merge(lookup, how="left")
In [8]:
# Reference point for the distance features
# (presumably the centre of the attacked goal on a 105 x 68 pitch - TODO confirm)
target_x, target_y = 105, 34

# Euclidean distance from the action's start and end location to the reference point, rounded to 2 decimals
start_dx = target_x - df['start_x_a0']
start_dy = target_y - df['start_y_a0']
end_dx = target_x - df['end_x_a0']
end_dy = target_y - df['end_y_a0']
df["beginning_distance"] = np.sqrt(np.square(start_dx) + np.square(start_dy)).round(2)
df["end_distance"] = np.sqrt(np.square(end_dx) + np.square(end_dy)).round(2)

# "length" is the absolute change in distance to the reference point, not the distance travelled
df["length"] = (df["end_distance"] - df["beginning_distance"]).abs()

# Direction of travel: radians from arctan2, then degrees wrapped into [0, 360)
move_x = df["end_x_a0"] - df["start_x_a0"]
move_y = df["end_y_a0"] - df["start_y_a0"]
df["angle"] = np.arctan2(move_y, move_x)
df['angle_degrees'] = np.degrees(df['angle']) % 360
In [9]:
# Flag progressive actions (author's own definition); every condition below must hold
# 1) the action removes at least 17.5% of the starting distance to the reference point
relative_gain = (df['beginning_distance'] - df['end_distance']) / df['beginning_distance']
# 2) the distance to the reference point changes by more than 5 units
long_enough = df['length'] > 5
# 3) the direction of travel lies in [0, 60] or [260, 360] degrees
allowed_direction = df['angle_degrees'].between(0, 60) | df['angle_degrees'].between(260, 360)
# 4) the action does NOT start inside the box x >= 88.5, 13.885 <= y <= 54.115
starts_in_excluded_box = (df['start_x_a0'] >= 88.5) & df['start_y_a0'].between(13.885, 54.115)

df['progressive'] = (
    (relative_gain >= 0.175)
    & long_enough
    & allowed_direction
    & ~starts_in_excluded_box
)
In [10]:
# Club to analyse: compared against the team_name column below and reused in the figure title and output filename
club = 'Juventus'
In [11]:
# Keep only the events of the selected club, renumbering the rows from zero
is_club_event = df["team_name"].eq(club)
df1 = df.loc[is_club_event].reset_index(drop=True)
In [12]:
# FotMob team id of the selected club, used later to build the club logo URL.
# The id arrives through a left merge, so it can be missing: take the first non-null value and
# fail with a clear message instead of silently casting NaN to a meaningless integer
# (or raising a bare IndexError when the club filter matched no rows).
fotmob_ids = df1["fotmob_id"].dropna()
if fotmob_ids.empty:
    raise ValueError(
        f"No FotMob id found for club {club!r}: check the club name and the teamsFOTMOB.csv mapping"
    )
fotmob = int(fotmob_ids.iloc[0])
In [13]:
# Restrict the club's events to those flagged as progressive (the flag is already boolean)
df2 = df1.loc[df1['progressive']]
In [14]:
# Count progressive actions for every player in every game
player_game_keys = ['player_name', 'player_id', 'game_id', 'game_date', 'season_id', 'competition_id']
FF0 = (
    df2.groupby(player_game_keys, observed=True)
    .agg(progressive_actions=("type_name", "count"))
    .reset_index()
)
In [15]:
# Show the distinct player names present in the counts, as a reference for the manual selection below
FF0["player_name"].unique()
Out[15]:
array(['Alberto Costa', 'Andrea Cambiaso', 'Bremer', 'Danilo', 'Douglas Luiz', 'Dusan Vlahovic', 'Federico Gatti', 'Francisco Conceição', 'Jonas Rouhi', 'Juan Cabal', 'Kenan Yildiz', 'Khéphren Thuram', 'Lloyd Kelly', 'Lorenzo Anghelè', 'Manuel Locatelli', 'Mattia Perin', 'Michele Di Gregorio', 'Nicolás González', 'Nicolò Fagioli', 'Nicolò Savona', 'Pierre Kalulu', 'Randal Kolo Muani', 'Renato Veiga', 'Samuel Mbangula', 'Teun Koopmeiners', 'Timothy Weah', 'Vasilije Adzic', 'Weston McKennie'], dtype=object)
In [16]:
# Hand-picked subset of players whose counts are kept
selected_players = [
    'Andrea Cambiaso', 'Bremer', 'Danilo', 'Federico Gatti', 'Jonas Rouhi', 'Juan Cabal',
    'Lloyd Kelly', 'Nicolò Savona', 'Pierre Kalulu', 'Renato Veiga', 'Timothy Weah',
    'Weston McKennie',
]
is_selected = FF0["player_name"].isin(selected_players)
FF1 = FF0.loc[is_selected].reset_index(drop=True)
In [17]:
# Collapse the selected players into one total per game
game_keys = ['game_id', 'game_date', 'season_id', 'competition_id']
FF2 = (
    FF1.groupby(game_keys, observed=True)
    .agg(progressive_actions=("progressive_actions", "sum"))
    .reset_index()
)
In [18]:
# Order the per-game totals chronologically, renumber the rows and replace any NaN with 0
FF = (
    FF2.sort_values(by='game_date')
    .reset_index(drop=True)
    .fillna(0)
)
In [19]:
# Total number of progressive actions across all games in the selection
SIZE = FF.progressive_actions.sum()
In [20]:
# 3-game rolling mean of progressive actions over the date-ordered list of matches.
# The window has to run ACROSS matches. Grouping by game_id/game_date/season_id/competition_id
# first puts every match in its own group (the table already holds one row per such key), so each
# window would contain a single match: the "rolling" value would just be that match's own count,
# and it would come back in group-key order rather than in date order.
# min_periods=1 lets the first two matches use the games available so far.
FF['rolling'] = (
    FF['progressive_actions']
    .rolling(window=3, min_periods=1)
    .mean()
)
In [21]:
from scipy.signal import savgol_filter

# Savitzky-Golay settings for the rolling series of progressive actions:
#   window_length - samples per local fit (author's rule of thumb: an odd number close to 10% of the matches)
#   polyorder     - degree of the polynomial fitted inside each window (quadratic)
window_length, polyorder = 5, 2

# mode="nearest" pads the series with its edge values so the first and last matches are smoothed too
FF['smoothed'] = savgol_filter(FF['rolling'], window_length, polyorder, mode="nearest")
In [22]:
# Turn a season identifier into a short "YY/YY" label
def format_season_id(season_id):
    """Return a two-part season label such as "24/25".

    The identifier is coerced with int(), so ints, floats (e.g. 2025.0) and
    numeric strings are all accepted. The second half of the label is the last
    two characters of that integer; the first half is the last two characters
    of the integer minus one.

    Args:
        season_id: Value convertible to int that identifies the season.

    Returns:
        str: Label of the form "<start>/<end>".

    Raises:
        ValueError: If season_id cannot be converted to an integer.
    """
    closing_year = int(season_id)
    opening_year = closing_year - 1
    return "/".join(str(year)[-2:] for year in (opening_year, closing_year))
In [23]:
# Replace the numeric season id with its "YY/YY" label for display
# NOTE(review): this overwrites the column, so running the cell a second time feeds labels back into int()
FF['season_id'] = FF['season_id'].map(format_season_id)
In [24]:
# Plot the per-game progressive actions: bubbles + smoothed trend on the right panel ('S'),
# frequency histogram of the per-game counts on the left panel ('D').

# Work on a copy under its own name so the merged events frame `df` built earlier in the
# notebook is not overwritten by the plotting cell (earlier cells can still be re-run afterwards)
plot_df = FF.copy()
plot_df['game_date'] = pd.to_datetime(plot_df['game_date'])

# Text fragments for the subtitle; ids are cast to str so non-string competition ids can be joined too
competition_ids = ', '.join(map(str, FF['competition_id'].unique()))
formatted_season = FF['season_id'].iloc[0]

# Set up the figure with mosaic layout: narrow histogram panel 'D', wide time-series panel 'S'
fig = plt.figure(figsize=(7, 4), dpi=200)
axs = fig.subplot_mosaic('DS', gridspec_kw={"width_ratios": [0.15, 0.5]}, sharey=True)

# Define colors
main_color = '#FB090B'
second_color = "#000000"

# Smoothed trend line on 'S', with a marker on the most recent match only
axs['S'].plot(plot_df['game_date'], plot_df['smoothed'], color=main_color, zorder=5,
              marker='o', markevery=[-1], markersize=3.5,
              mfc=axs['S'].get_facecolor(), lw=2, mew=1.5,
              label='3 game rolling average')
axs['S'].legend(markerscale=0.75, loc='upper center', bbox_to_anchor=[0.5, 1.06],
                fontsize=5, fancybox=True, framealpha=0.2)

# One bubble per match; both height and size encode that match's count
sns.scatterplot(data=plot_df, x='game_date', y='progressive_actions',
                size='progressive_actions', alpha=0.2, color=second_color,
                zorder=5, legend=False, sizes=(10, 110), ax=axs['S'],
                edgecolor=second_color)

# Match dates (already datetimes) drive the x-axis limits and ticks
X = plot_df['game_date']

# Define padding for the x-axis (7 days before first match, 7 days after last match)
padding_days = 7
x_min = X.min() - pd.Timedelta(days=padding_days)
x_max = X.max() + pd.Timedelta(days=padding_days)
axs['S'].set_xlim([x_min, x_max])

# One tick per match date, labelled like "Aug 19"
axs['S'].set_xticks(X)
axs['S'].set_xticklabels(X.dt.strftime('%b %d'), rotation=90, ha="right", color="#ACA7A5", fontsize=5)

# Vertical dashed grid lines only; horizontal grid lines are disabled
for panel in ('S', 'D'):
    axs[panel].grid(axis='x', linestyle='dashed', linewidth=0.3, color='#ACA7A5')
for panel in ('S', 'D'):
    axs[panel].grid(axis='y', linestyle='', linewidth=0)

# Histogram of per-game counts on 'D', mirrored so the bars grow towards the left
sns.histplot(data=plot_df, y='progressive_actions', ax=axs['D'], element='step',
             zorder=2, color=second_color, alpha=0.5, stat='count')
axs['D'].invert_xaxis()
axs['D'].spines['left'].set_visible(False)
axs['D'].spines['right'].set_visible(True)
axs['D'].yaxis.tick_right()
axs['D'].set_ylabel('')
axs['D'].set_xlabel('Number of times', color="#4E616C", size=7)
axs['D'].yaxis.set_major_locator(ticker.MultipleLocator(10))
axs['D'].xaxis.set_major_locator(ticker.MultipleLocator(2))

# Additional subplot 'S' customizations
axs['S'].set_xlabel('Match Date', color="#4E616C", size=7)
for spine in ['bottom', 'left', 'right', 'top']:
    axs['S'].spines[spine].set_color('#ACA7A5')
for spine in ['bottom', 'right', 'top']:
    axs['D'].spines[spine].set_color('#ACA7A5')
axs['D'].tick_params(axis='both', labelsize=8, color='#ACA7A5', labelcolor='#ACA7A5')

# Add descriptive text to the figure
fig.text(0.12, 1.02, f"{club}'s progressive actions consistency by defenders + Weah and McKennie",
         ha='left', va='center', fontfamily='SourceSansPro-SemiBold', size=10.5)
fig.text(0.12, 0.96, (f"3-game Savitzky-Golay smoothed rolling average of progressive actions in {competition_ids} {formatted_season}.\n"
                      f"Bubbles size and height denote the amount of progressive actions for each game. Bars underline frequency of that amount."),
         ha='left', va='center', size=5)

# Add an endnote at the bottom of the figure
fig.text(0.5, -0.05, "X: @gualanodavide | Bluesky: @gualanodavide.bsky.social | Linkedin: www.linkedin.com/in/davide-gualano-a2454b187 | Newsletter: the-cutback.beehiiv.com",
         ha='center', fontsize=5, color="#000000")

# Add club icon image to the figure
ax_size = 0.075
image_ax = fig.add_axes([0.83, 0.98, ax_size, ax_size], fc='None')
fotmob_url = 'https://images.fotmob.com/image_resources/logo/teamlogo/'
club_icon_url = f"{fotmob_url}{fotmob}.png"
# The context manager closes the HTTP response; load() forces the image to be fully
# decoded while the response is still open, so nothing is read after it is closed
with urllib.request.urlopen(club_icon_url) as icon_response:
    club_icon = Image.open(icon_response)
    club_icon.load()
image_ax.imshow(club_icon)
image_ax.axis('off')

# Save and show the figure
plt.savefig(f'{club}rollingmetrics.png', dpi=500, facecolor="#D7D1CF",
            bbox_inches="tight", transparent=True)
plt.show()
In [ ]:
In [ ]:
In [ ]: