In [1]:
# Import necessary libraries for data manipulation, visualization, and analysis
import socceraction
import socceraction.spadl as spadl
import matplotlib
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.patheffects as path_effects
import pandas as pd
import numpy as np
import warnings
from mplsoccer import Pitch, VerticalPitch, lines
from scipy.ndimage import gaussian_filter
from mplsoccer import Pitch, VerticalPitch, lines
from PIL import Image
import urllib
In [2]:
# Configure pandas display options and suppress specific warnings
pd.set_option('display.max_columns', None) # Ensure all columns are displayed in DataFrame outputs
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning) # Ignore performance warnings from pandas
warnings.filterwarnings(action="ignore", message="credentials were not supplied. open data access only") # Suppress credential-related warnings
In [3]:
# Load custom fonts for visualization
fe_regular = fm.FontEntry(
fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-Regular.ttf',
name='SourceSansPro-Regular'
)
fe_semibold = fm.FontEntry(
fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-SemiBold.ttf',
name='SourceSansPro-SemiBold'
)
# Insert both fonts into the font manager
fm.fontManager.ttflist.insert(0, fe_regular)
fm.fontManager.ttflist.insert(1, fe_semibold)
# Set the font family to the custom regular font
matplotlib.rcParams['font.family'] = fe_regular.name
In [4]:
# Load datasets from CSV files
xG = pd.read_csv("xGactions.csv", index_col=0) # Expected goals data
fb = pd.read_csv("teamsFOTMOB.csv", index_col=0) # Ids mapping from FOTMOB and Whoscored
players = pd.read_csv("players2425.csv", index_col=0) # Players data
games = pd.read_csv("games2425.csv", index_col=0) # Games data
actions = pd.read_csv("actions2425.csv", index_col=0) # Events data
In [5]:
# Select relevant columns from games dataset
games = games[["game_id", "competition_id", "season_id"]]
# Select relevant player information
players_info = players[['game_id', 'team_id', 'player_id', 'player_name', 'season_id', 'competition_id']]
# Add descriptive action names to the actions DataFrame
actions = spadl.add_names(actions)
In [6]:
# Merge datasets to create a unified DataFrame
df = (
actions
.merge(fb, how="left") # Merge mapping data
.merge(xG, how="left") # Merge expected goals data
.merge(games, how="left") # Merge games data
.merge(players_info, how="left") # Merge player information
)
In [7]:
# Calculate movement distances and angles
df["beginning_distance"] = np.sqrt(np.square(105 - df['start_x_a0']) + np.square(34 - df['start_y_a0'])).round(2)
df["end_distance"] = np.sqrt(np.square(105 - df['end_x_a0']) + np.square(34 - df['end_y_a0'])).round(2)
df["length"] = df["end_distance"] - df["beginning_distance"]
df['length'] = df['length'].abs() # Ensure length values are always positive
df["angle"] = np.arctan2(df["end_y_a0"] - df["start_y_a0"], df["end_x_a0"] - df["start_x_a0"]) # Calculate angle in radians
df['angle_degrees'] = np.degrees(df['angle']) % 360 # Convert angle to degrees and normalize it to 0-360 range
In [8]:
#We create a set of features for the successive row, so that we can filter passes that result in shots
df["next_type_name"] = df.shift(-1, fill_value=0)["type_name"]
df["next_team_name"] = df.shift(-1, fill_value=0)["team_name"]
In [9]:
df['is_inbox'] = np.where(
(
# Start coordinates NOT inside the box
((df['start_x_a0'] < 88.5) | (df['start_y_a0'] < 13.885) | (df['start_y_a0'] > 54.115)) &
# End coordinates ARE inside the box
(df['type_name'].isin(['cross', 'pass', 'dribble'])) &
(df['end_x_a0'] >= 88.5) &
(df['end_y_a0'] >= 13.885) &
(df['end_y_a0'] <= 54.115)
),
True, False)
In [10]:
# Function to format season ID into a readable format
def format_season_id(season_id):
# Convert to integer if it's a float
season_id = int(season_id)
# Extract the last two digits of the year
start_year = str(season_id - 1)[-2:]
# Calculate the end year
end_year = str(season_id)[-2:]
# Format as 20/21
formatted_season = f"{start_year}/{end_year}"
return formatted_season
In [11]:
#We select the type of action we are interested in
passes = df[(df["type_name"] == "pass") & (df["is_inbox"] == True)]
carries = df[(df["type_name"] == "dribble") & (df["is_inbox"] == True)]
In [12]:
#Get the whole list of teams in the data
playerlist = df['team_name'].unique().tolist()
cleaned_playerlist = [name for name in playerlist if pd.notna(name)]
cleaned_playerlist.sort()
In [13]:
from IPython.display import display, HTML
# Generate the HTML dropdown to easily search for players
options_html = ''.join([f'<option value="{name}">{name}</option>' for name in cleaned_playerlist])
dropdown_html = f"""
<input list="players" id="dropdown" oninput="handleInput()" placeholder="Choose Someone">
<datalist id="players">
{options_html}
</datalist>
<p id="output"></p>
<script>
function handleInput() {{
var input = document.getElementById("dropdown").value;
var output = document.getElementById("output");
output.innerHTML = "Selected: " + input;
}}
</script>
"""
# Display the dropdown
display(HTML(dropdown_html))
In [14]:
# Select a specific team
team = 'Marseille'
In [15]:
# Filter actions dataframes for the selected team
passes1 = passes[passes["team_name"] == team]
carries1 = carries[carries["team_name"] == team]
In [16]:
# Apply the function to the 'season_id' column
passes1['formatted_season'] = passes1['season_id'].apply(format_season_id)
carries1['formatted_season'] = carries1['season_id'].apply(format_season_id)
/var/folders/ns/3wxdg4g57h77vxwmr4wzmvt40000gn/T/ipykernel_19252/3557201878.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy passes1['formatted_season'] = passes1['season_id'].apply(format_season_id) /var/folders/ns/3wxdg4g57h77vxwmr4wzmvt40000gn/T/ipykernel_19252/3557201878.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy carries1['formatted_season'] = carries1['season_id'].apply(format_season_id)
In [17]:
#We divide between goals resulting and not
passes_succ = passes1[passes1["result_name"].isin(["success"])]
passes_fail = passes1[passes1["result_name"].isin(["fail"])]
#We create another dataframe for those passes that generate a shot
key_passes = passes_succ[passes_succ["next_type_name"] == 'shot']
In [18]:
# Set up gridspec figure
fig = plt.figure(figsize=(16, 12), constrained_layout=True)
gs = fig.add_gridspec(6, 6, wspace=0.1, hspace=0.1)
# Create the axes
ax1 = fig.add_subplot(gs[0:3, :3])
ax2 = fig.add_subplot(gs[0:3, 3:])
ax3 = fig.add_subplot(gs[3:, 0:3])
ax4 = fig.add_subplot(gs[3:, 3:])
#Create the pitches
pitch1 = VerticalPitch(pitch_type='custom', pitch_width=68, pitch_length=105, half=True, pad_top=0.4, goal_type='box',
linewidth=1.25, line_color='#000000')
pitch2 = VerticalPitch(pitch_type='custom', pitch_width=68, pitch_length=105, half=True, pad_top=0.4, goal_type='box',
linewidth=1.25, line_color='#000000')
pitch1.draw(ax=ax1)
pitch2.draw(ax=ax2)
# plot the heatmap - darker colors = more actions originating from that square
bins = (18, 12)
cmap1 = mcolors.LinearSegmentedColormap.from_list("custom_red", ["#D7D1CF", "#FF0000"])
cmap2 = mcolors.LinearSegmentedColormap.from_list("custom_red", ["#D7D1CF", "#FF0000"])
bs_heatmap1 = pitch1.bin_statistic(passes1.start_x_a0, passes1.start_y_a0, statistic='count', bins=bins)
hm = pitch1.heatmap(bs_heatmap1, ax=ax1, cmap=cmap1, zorder = 1, alpha = 0.8)
bs_heatmap2 = pitch1.bin_statistic(carries1.start_x_a0, carries1.start_y_a0, statistic='count', bins=bins)
hm = pitch2.heatmap(bs_heatmap2, ax=ax2, cmap=cmap2, zorder = 1, alpha = 0.8)
# Scatter plots and arrows for the passes and carries in the two pitches
pitch1.arrows(key_passes.start_x_a0, key_passes.start_y_a0, key_passes.end_x_a0, key_passes.end_y_a0, width=1, alpha = 0.5, zorder = 1,
headwidth = 10, headlength = 8, color = '#000000', ax=ax1)
pitch1.scatter(passes_succ.start_x_a0, passes_succ.start_y_a0, c='#000000', marker='o', s=50, ax=ax1, zorder=2, ec='#000000')
pitch1.scatter(passes_fail.start_x_a0, passes_fail.start_y_a0, c='#FFFFFF', marker='o', s=50, ax=ax1, zorder=1, ec='#000000')
pitch2.plot([carries1.start_x_a0, carries1.end_x_a0],
[carries1.start_y_a0, carries1.end_y_a0],
linestyle='--', linewidth=1, color='#000000', markersize=2,
zorder=2, ax=ax2)
pitch2.scatter(carries1.end_x_a0, carries1.end_y_a0, c='#000000', s=50, ax=ax2, zorder=1, alpha=0.5)
# For ax3 - Pass distribution by player (top 5)
pass_players = passes1['player_name'].value_counts().head(8) # Get only top 5
y_pos = np.arange(len(pass_players))
# Create horizontal bars with custom styling
bars3 = ax3.barh(y_pos, pass_players.values, color='#1565C0', edgecolor='black', height=0.6)
ax3.set_yticks(y_pos)
ax3.set_yticklabels(pass_players.index)
# Generate appropriate x-ticks
max_value = pass_players.values.max()
step = max(1, max_value // 5) # Divide range into approximately 5 steps
x_ticks = np.arange(0, max_value + step, step)
ax3.set_xticks(x_ticks)
# Add dotted lines from the x-ticks vertically
for x in x_ticks:
ax3.plot([x, x], [-0.5, len(pass_players) - 0.5], 'k:', alpha=0.3, zorder=0)
# Add value labels to the end of each bar with path effects
for i, v in enumerate(pass_players.values):
text = ax3.text(2, i, str(v), va='center', fontsize=15, color='white')
# Add path effects for the edge
text.set_path_effects([path_effects.Stroke(linewidth=3, foreground='#000000'),
path_effects.Normal()])
# Invert the y-axis
ax3.invert_yaxis()
# Remove spines (borders) from bottom, top, and right sides
ax3.spines['right'].set_visible(False)
ax3.spines['left'].set_visible(False)
ax3.spines['top'].set_visible(False)
ax3.spines['bottom'].set_visible(False)
# For ax4 - Carry distribution by player (top 5)
carries_players = carries1['player_name'].value_counts().head(8) # Get only top 5
y_pos = np.arange(len(carries_players))
# Create horizontal bars with custom styling
bars4 = ax4.barh(y_pos, carries_players.values, color='#1565C0', edgecolor='black', height=0.6)
ax4.set_yticks(y_pos)
ax4.set_yticklabels(carries_players.index)
# Generate appropriate x-ticks
max_value = carries_players.values.max()
step = max(1, max_value // 5) # Divide range into approximately 5 steps
x_ticks = np.arange(0, max_value + step, step)
ax4.set_xticks(x_ticks)
# Add dotted lines from the x-ticks vertically
for x in x_ticks:
ax4.plot([x, x], [-0.5, len(carries_players) - 0.5], 'k:', alpha=0.3, zorder=0)
# Add value labels to the end of each bar with path effects
for i, v in enumerate(carries_players.values):
text = ax4.text(2, i, str(v), va='center', fontsize=15, color='white')
# Add path effects for the edge
text.set_path_effects([path_effects.Stroke(linewidth=3, foreground='#000000'),
path_effects.Normal()])
# Invert the y-axis
ax4.invert_yaxis()
# Remove spines (borders) from bottom, top, and right sides
ax4.spines['right'].set_visible(False)
ax4.spines['left'].set_visible(False)
ax4.spines['top'].set_visible(False)
ax4.spines['bottom'].set_visible(False)
# Titles for both subplots
team_name = passes1['team_name'].iloc[0]
competition_ids = ', '.join(passes1['competition_id'].unique())
formatted_season = passes1['formatted_season'].iloc[0]
season_id = passes1['season_id'].iloc[0]
# Titles
ax1.text(0.5, 1.05,
f"Key Passes : Arrows ({len(key_passes)}) | Successful : Black ({(passes_succ.shape[0])}) | Unsuccessful : White ({passes_fail.shape[0]})",
color='#000000', va='center', ha='center', fontsize=11, transform=ax1.transAxes)
ax1.text(0.5, 1.1, f"Passes into box", color='#000000',
va='center', ha='center', fontsize=11, transform=ax1.transAxes)
ax2.text(1.74, 1.05, f"Carries into box", color='#000000',
va='center', ha='center', fontsize=11, transform=ax1.transAxes)
fig.text(0.07, 1.06, f'{team_name} box entries',
fontsize=30, va='center', ha='left', fontfamily='SourceSansPro-SemiBold')
fig.text(0.07, 1.03, f'{competition_ids} {formatted_season} | Heatmap: Starting Coordinates',
fontsize=20, va='center', ha='left')
fig.text(0, -0.05, 'X: @gualanodavide | Bluesky: @gualanodavide.bsky.social | Linkedin: www.linkedin.com/in/davide-gualano-a2454b187 | Newsletter: the-cutback.beehiiv.com', va='center', ha='left', fontsize=12)
#Adding the club logo
DC_to_FC = ax1.transData.transform
FC_to_NFC = fig.transFigure.inverted().transform
# -- Take data coordinates and transform them to normalized figure coordinates
DC_to_NFC = lambda x: FC_to_NFC(DC_to_FC(x))
ax_size = 0.06
y = 108
# Get the data coordinates for the specific x and y values
data_coords = DC_to_FC((-4, (y * 1.18) - 2.25)) # This returns a tuple
ax_coords = FC_to_NFC(data_coords) # Transform to figure coordinates
# Adjust the x-coordinate
adjusted_x = 0
ax_coords = (adjusted_x, ax_coords[1]) # Create new ax_coords with adjusted x
# Retrieve the team_id and team_name from the DataFrame
team_id = fb[fb['team_name'] == team]['fotmob_id'].iloc[0]
team_name = team # Team name is already defined
# Add an axis for the image
image_ax = fig.add_axes([ax_coords[0], ax_coords[1], ax_size, ax_size], fc='None', anchor='C')
fotmob_url = 'https://images.fotmob.com/image_resources/logo/teamlogo/'
try:
player_face = Image.open(urllib.request.urlopen(f"{fotmob_url}{team_id}.png")).convert('RGBA')
image_ax.imshow(player_face)
except Exception as e:
print(f"Error loading image for team {team_name}: {e}")
# If an error occurs, you might want to exit or handle it differently.
# 'continue' is removed because it's not in a loop.
image_ax.axis("off")
# Save the figure
plt.savefig(f'{team_name}-boxentries-{season_id}.png', dpi=500, facecolor="#D7D1CF", bbox_inches="tight", transparent=True)
plt.show()
In [ ]:
In [ ]: