In [1]:
# Import necessary libraries for data manipulation, visualization, and analysis
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.font_manager as fm
from matplotlib.colors import LinearSegmentedColormap
import socceraction
import socceraction.atomic.spadl as atomicspadl
from mplsoccer import Pitch, VerticalPitch, lines
from scipy.ndimage import gaussian_filter
from PIL import Image
import urllib
In [2]:
# Load custom fonts for visualization
fe_regular = fm.FontEntry(
    fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-Regular.ttf',
    name='SourceSansPro-Regular'
)
fe_semibold = fm.FontEntry(
    fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-SemiBold.ttf',
    name='SourceSansPro-SemiBold'
)

# Insert both fonts into the font manager
fm.fontManager.ttflist.insert(0, fe_regular)
fm.fontManager.ttflist.insert(1, fe_semibold)

# Set the font family to the custom regular font
matplotlib.rcParams['font.family'] = fe_regular.name
In [3]:
# Load datasets from CSV files
fb = pd.read_csv("teamsFOTMOB.csv", index_col = 0)
players = pd.read_csv("players2425.csv", index_col = 0)
games = pd.read_csv("games2425.csv", index_col = 0)
actions = pd.read_csv("atomic_actions2425.csv", index_col = 0)
In [4]:
#Create a definition to convert minutes in seconds played for filtering reasons
def convert_to_seconds(time_str):
    try:
        # Convert to string in case it's a float (e.g., NaN)
        time_str = str(time_str)
        # Split the time string into minutes and seconds
        minutes, seconds = map(int, time_str.split(':'))
        # Convert total time to seconds (minutes converted to seconds)
        return minutes * 60 + seconds
    except (ValueError, AttributeError):
        # Handle cases where the conversion fails (e.g., NaN or bad format)
        return 0  # or use `np.nan` if you prefer to mark as missing

# Apply the conversion function to the 'minutes_played' column
players['seconds_played'] = players['minutes_played'].apply(convert_to_seconds)
players['start_second'] = players['start_second'].apply(convert_to_seconds)
players['end_second'] = players['end_second'].apply(convert_to_seconds)
In [5]:
# Add descriptive action names to the actions DataFrame
atomic = atomicspadl.add_names(actions)
# Merge events and players infos
df0 = atomic.merge(players, on=['game_id', 'team_id', 'player_id'])
# Merge events and teams names mapping
df = df0.merge(fb)
In [6]:
# Function to format season ID into a readable format
def format_season_id(season_id):
    # Convert to integer if it's a float
    season_id = int(season_id)
    # Extract the last two digits of the year
    start_year = str(season_id -1)[-2:]
    # Calculate the end year
    end_year = str(season_id)[-2:]
    # Format as 20/21
    formatted_season = f"{start_year}/{end_year}"
    return formatted_season
In [7]:
#Select relevant type of actions
df1 = df[df["type_name"].isin(['pass', 'receival', 'dribble', 'interception', 'out', 'clearance', 'take_on', 'tackle', 'shot', 'offside',
                              'bad_touch', 'keeper_pick_up', 'foul', 'keeper_save', 'cross', 'goalkick', 'keeper_claim', 'goal', 
                               'keeper_punch',  'owngoal'])]
In [8]:
#Get the whole list of players in the data
teamlist = df['team_name'].unique().tolist()
cleaned_teamlist = [name for name in teamlist  if pd.notna(name)]
cleaned_teamlist.sort()
In [9]:
from IPython.display import display, HTML

# Generate the HTML dropdown to easily search for players
options_html = ''.join([f'<option value="{name}">{name}</option>' for name in cleaned_teamlist])

dropdown_html = f"""
<input list="players" id="dropdown" oninput="handleInput()" placeholder="Choose Someone">
<datalist id="players">
    {options_html}
</datalist>
<p id="output"></p>
<script>
function handleInput() {{
    var input = document.getElementById("dropdown").value;
    var output = document.getElementById("output");
    output.innerHTML = "Selected: " + input;
}}
</script>
"""

# Display the dropdown
display(HTML(dropdown_html))

In [10]:
#Choosing team
team = 'Liverpool'
In [11]:
#Selecting team I want to explore and season id in data
df2a = df1[df1["team_name"] == team]
df2a.season_id.unique()
Out[11]:
array([2425])
In [12]:
# Apply the function to the 'season_id' column and filter for season id
df2a['formatted_season'] = df2a['season_id'].apply(format_season_id)
df2b = df2a[df2a["season_id"].isin([2425])]
/var/folders/ns/3wxdg4g57h77vxwmr4wzmvt40000gn/T/ipykernel_41061/1782193873.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2a['formatted_season'] = df2a['season_id'].apply(format_season_id)

Relative frequency offensive heatmap¶

In [13]:
#We watch the list of players for the team we wanted to explore
df6a = df2b[df2b["game_id"].isin(df2b.game_id.unique())]
df6a.player_name.unique()
Out[13]:
array(['Dominik Szoboszlai', 'Wataru Endo', 'Caoimhín Kelleher',
       'Trent Alexander-Arnold', 'Jarell Quansah', 'Darwin Núñez',
       'Harvey Elliott', 'Kostas Tsimikas', 'Rio Ngumoha', 'Diogo Jota',
       'Tyler Morton', 'Federico Chiesa', 'Conor Bradley',
       'James McConnell', 'Jayden Danns', 'Treymaurice Nyoni',
       'Joe Gomez', 'Luis Díaz', 'Isaac Mabaya', 'Trent Koné-Doherty',
       'Alexis Mac Allister', 'Virgil van Dijk', 'Ibrahima Konaté',
       'Mohamed Salah', 'Andy Robertson', 'Ryan Gravenberch',
       'Curtis Jones', 'Cody Gakpo', 'Alisson Becker', 'Vítezslav Jaros',
       'Amara Nallo'], dtype=object)
In [14]:
#Choosing player
player = 'Darwin Núñez'
In [15]:
#We filter for the player we want to visualize data
df6b = df6a[df6a["player_name"] == player]
#Select offensive actions
df6 = df6b[df6b["type_name"].isin(['pass', 'receival', 'dribble', 'out', 'take_on', 'shot', 'offside', 'bad_touch', 'keeper_pick_up',
                                   'keeper_save', 'cross', 'goalkick', 'keeper_claim', 'goal', 'keeper_punch',  'owngoal'])]
In [16]:
# Initialize an empty list to store results
results = []

# Loop through each unique game_id in df2b
for game_id in df2b['game_id'].unique():
    # Filter df3 for the current game_id
    game_data = df6[df6['game_id'] == game_id]

    if not game_data.empty:
        # Extract the necessary variables for the first row of the game_data
        start_second = game_data['start_second'].iloc[0]
        end_second = game_data['end_second'].iloc[0]
        start_period = game_data['start_period'].iloc[0]
        end_period = game_data['end_period'].iloc[0]
        
        # Filter df3a for the relevant start and end rows
        start_row = df6a[(df6a['period_id'] == start_period) & (df6a['time_seconds'] >= start_second)].head(1)
        end_row = df6a[(df6a['period_id'] == end_period) & (df6a['time_seconds'] <= end_second)].tail(1)
        
        if not start_row.empty and not end_row.empty:
            start_idx = start_row.index[0]
            end_idx = end_row.index[0]
            
            if start_idx <= end_idx:
                # Filter all rows between the identified start and end rows, inclusive
                df8 = df6a.loc[start_idx:end_idx]
                
                # Optionally add a column for game_id to keep track of which game this data belongs to
                df8['game_id'] = game_id
                
                # Append the filtered DataFrame to results
                results.append(df8)

# Concatenate all results into a single DataFrame if needed
df8 = pd.concat(results, ignore_index=True)
/var/folders/ns/3wxdg4g57h77vxwmr4wzmvt40000gn/T/ipykernel_41061/2203434804.py:29: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df8['game_id'] = game_id
In [17]:
#Definition to automatically retrieve player name and insert in the title function
nrows = df6.shape[0]
for y in range(nrows):
    title = df6['team_name'].iloc[y]
    title1 = df6['player_name'].iloc[y]
    title2 = df6['season_id'].iloc[y]

#Create pitch and axes
pitch = Pitch(pitch_type='custom', pitch_width=68, pitch_length=105, goal_type='box', linewidth=1.25, line_color='#000000', 
              line_zorder=2, pitch_color='#D7D1CF')
fig, axs = pitch.grid(endnote_height=0.03, endnote_space=0, grid_width=0.88, left=0.025, title_height=0.06, title_space=0, axis=False,
                      grid_height=0.86)
fig.set_facecolor('#D7D1CF')

# Define the colormap with #D7D1CF as the lowest value
cmap = LinearSegmentedColormap.from_list('custom_cmap', ['#D7D1CF', '#FFFFFF', '#FFFF80', '#FF8000', '#800000', '#000000'], N=256)

# Define the data to normalize the player heatmap
bin_statisticplayer = pitch.bin_statistic(df6.x_a0, df6.y_a0, statistic='count', bins=(105, 68))
bin_statisticteam = pitch.bin_statistic(df8.x_a0, df8.y_a0, statistic='count', bins=(105, 68))

# Normalize bin_statistic['statistic'] using bin_statistic2['statistic']
normalized_statistic = np.where(bin_statisticplayer['statistic'] != 0, bin_statisticplayer['statistic'] / bin_statisticteam['statistic'], 0)

# Apply Gaussian smoothing
bin_statisticplayer['statistic'] = gaussian_filter(normalized_statistic, 2.5)
pcm = pitch.heatmap(bin_statisticplayer, ax=axs['pitch'], cmap=cmap)

# endnote and title
axs['endnote'].text(0, 1, 'X: @gualanodavide | Bluesky: @gualanodavide.bsky.social | Linkedin: www.linkedin.com/in/davide-gualano-a2454b187 | Newsletter: the-cutback.beehiiv.com',
                    va='center', ha='left', fontsize=9, color='#000000')
ax_title = axs['title'].text(0.5, .9, f"{title1} relative frequency offensive seasonal heatmap", color='#000000', va='center', ha='center',
                             fontsize=20, fontfamily='SourceSansPro-SemiBold')
ax_title = axs['title'].text(0.5, .4, f"{', '.join(df6['competition_id'].unique())} {', '.join(df6['formatted_season'].unique())}",
                             color='#000000', va='center', ha='center', fontsize=12)

# Annotation with data coordinates and offset points.
title5 = axs['title'].annotate(xy=(.632, 0), xytext=(-200, 0), textcoords="offset points", text="",
    size=10, color="#000000", arrowprops=dict(arrowstyle="-|>", shrinkA=0, color="black", linewidth=1.5))

# Club logo and annotations
fotmob_url = "https://images.fotmob.com/image_resources/logo/teamlogo/"
club_icon = Image.open(urllib.request.urlopen(f"{fotmob_url}{df6['fotmob_id'].iloc[0]}.png"))

# Add the club logos to the subplot 1
logo_ax1 = fig.add_axes([0.05, 0.88, 0.1, 0.1], zorder=1)  # Adjusted to fit subplot 1
logo_ax1.imshow(club_icon)
logo_ax1.axis("off")
/var/folders/ns/3wxdg4g57h77vxwmr4wzmvt40000gn/T/ipykernel_41061/3750933195.py:23: RuntimeWarning: invalid value encountered in divide
  normalized_statistic = np.where(bin_statisticplayer['statistic'] != 0, bin_statisticplayer['statistic'] / bin_statisticteam['statistic'], 0)
Out[17]:
(-0.5, 191.5, 191.5, -0.5)
No description has been provided for this image
In [ ]:
 
In [ ]:
 
In [ ]:
 

Relative frequency (to where the opponents do their actions) player defensive heatmap¶

In [18]:
#Filtering to find defensive actions of the player
df3a = df2b[df2b["type_name"].isin(['interception', 'clearance', 'tackle', 'foul'])]
df3 = df3a[df3a["player_name"] == player]
In [19]:
df4a = df1[df1["game_id"].isin(df2b.game_id.unique())]
df4 = df4a[df4a["team_name"] != team]
In [20]:
# Initialize an empty list to store results
results = []

# Loop through each unique game_id in df2b
for game_id in df2b['game_id'].unique():
    # Filter df3 for the current game_id
    game_data = df3[df3['game_id'] == game_id]

    if not game_data.empty:
        # Extract the necessary variables for the first row of the game_data
        start_second = game_data['start_second'].iloc[0]
        end_second = game_data['end_second'].iloc[0]
        start_period = game_data['start_period'].iloc[0]
        end_period = game_data['end_period'].iloc[0]
        
        # Filter df3a for the relevant start and end rows
        start_row = df3a[(df3a['period_id'] == start_period) & (df3a['time_seconds'] >= start_second)].head(1)
        end_row = df3a[(df3a['period_id'] == end_period) & (df3a['time_seconds'] <= end_second)].tail(1)
        
        if not start_row.empty and not end_row.empty:
            start_idx = start_row.index[0]
            end_idx = end_row.index[0]
            
            if start_idx <= end_idx:
                # Filter all rows between the identified start and end rows, inclusive
                df4 = df3a.loc[start_idx:end_idx]
                
                # Optionally add a column for game_id to keep track of which game this data belongs to
                df4['game_id'] = game_id
                
                # Append the filtered DataFrame to results
                results.append(df4)

# Concatenate all results into a single DataFrame if needed
df4 = pd.concat(results, ignore_index=True)
/var/folders/ns/3wxdg4g57h77vxwmr4wzmvt40000gn/T/ipykernel_41061/3666525652.py:29: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['game_id'] = game_id
In [21]:
#Definition to automatically retrieve player name and insert in the title function
nrows = df3.shape[0]
for y in range(nrows):
    title = df3['team_name'].iloc[y]
    title1 = df3['player_name'].iloc[y]
    title2 = df3['season_id'].iloc[y]

#Create pitch and axes
pitch = Pitch(pitch_type='custom', pitch_width=68, pitch_length=105, goal_type='box', linewidth=1.25, line_color='#000000', 
              line_zorder=2, pitch_color='#D7D1CF')
fig, axs = pitch.grid(endnote_height=0.03, endnote_space=0, grid_width=0.88, left=0.025, title_height=0.06, title_space=0, axis=False,
                      grid_height=0.86)
fig.set_facecolor('#D7D1CF')

# Define the colormap with #D7D1CF as the lowest value
cmap = LinearSegmentedColormap.from_list('custom_cmap', ['#D7D1CF', '#FFFFFF', '#FFFF80', '#FF8000', '#800000', '#000000'], N=256)

# Define the data to normalize the player heatmap
bin_statisticplayer = pitch.bin_statistic(df3.x_a0, df3.y_a0, statistic='count', bins=(105, 68))
bin_statisticteam = pitch.bin_statistic(df4.x, df4.y, statistic='count', bins=(105, 68))

# Normalize bin_statistic['statistic'] using bin_statistic2['statistic']
normalized_statistic = np.where(bin_statisticteam['statistic'] != 0, bin_statisticplayer['statistic'] / bin_statisticteam['statistic'], 0)

# Apply Gaussian smoothing
bin_statisticplayer['statistic'] = gaussian_filter(normalized_statistic, 2.5)
pcm = pitch.heatmap(bin_statisticplayer, ax=axs['pitch'], cmap=cmap)

# endnote and title
axs['endnote'].text(0, 1, 'X: @gualanodavide | Bluesky: @gualanodavide.bsky.social | Linkedin: www.linkedin.com/in/davide-gualano-a2454b187 | Newsletter: the-cutback.beehiiv.com',
                    va='center', ha='left', fontsize=9, color='#000000')
ax_title = axs['title'].text(0.5, .9, f"{title1} relative frequency defensive seasonal heatmap", color='#000000', va='center', ha='center',
                             fontsize=20, fontfamily='SourceSansPro-SemiBold')
ax_title = axs['title'].text(0.5, .4, f"{', '.join(df3['competition_id'].unique())} {', '.join(df3['formatted_season'].unique())}",
                             color='#000000', va='center', ha='center', fontsize=12)

# Annotation with data coordinates and offset points.
title5 = axs['title'].annotate(xy=(.632, 0), xytext=(-200, 0), textcoords="offset points", text="",
    size=10, color="#000000", arrowprops=dict(arrowstyle="-|>", shrinkA=0, color="black", linewidth=1.5))

# Club logo and annotations
fotmob_url = "https://images.fotmob.com/image_resources/logo/teamlogo/"
club_icon = Image.open(urllib.request.urlopen(f"{fotmob_url}{df3['fotmob_id'].iloc[0]}.png"))

# Add the club logos to the subplot 1
logo_ax1 = fig.add_axes([0.05, 0.88, 0.1, 0.1], zorder=1)  # Adjusted to fit subplot 1
logo_ax1.imshow(club_icon)
logo_ax1.axis("off")
/var/folders/ns/3wxdg4g57h77vxwmr4wzmvt40000gn/T/ipykernel_41061/2044905807.py:23: RuntimeWarning: divide by zero encountered in divide
  normalized_statistic = np.where(bin_statisticteam['statistic'] != 0, bin_statisticplayer['statistic'] / bin_statisticteam['statistic'], 0)
/var/folders/ns/3wxdg4g57h77vxwmr4wzmvt40000gn/T/ipykernel_41061/2044905807.py:23: RuntimeWarning: invalid value encountered in divide
  normalized_statistic = np.where(bin_statisticteam['statistic'] != 0, bin_statisticplayer['statistic'] / bin_statisticteam['statistic'], 0)
Out[21]:
(-0.5, 191.5, 191.5, -0.5)
No description has been provided for this image
In [ ]:
 
In [ ]: