In [1]:
import matplotlib
import pandas as pd
import numpy as np
import warnings
import urllib
from PIL import Image
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.patches import RegularPolygon
from mplsoccer import Pitch, VerticalPitch, lines
from scipy.ndimage import gaussian_filter
In [2]:
import socceraction
import socceraction.spadl as spadl
In [3]:
fe_regular = fm.FontEntry(
fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-Regular.ttf',
name='SourceSansPro-Regular'
)
fe_semibold = fm.FontEntry(
fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-SemiBold.ttf',
name='SourceSansPro-SemiBold'
)
fe_medium = fm.FontEntry(
fname='/Users/davidegualano/Documents/Python FTBLData/Shentox-W01-Medium.ttf',
name='Shentox-Medium'
)
fe_bold = fm.FontEntry(
fname='/Users/davidegualano/Documents/Python FTBLData/Shentox-W01-Bold.ttf',
name='Shentox-Bold'
)
# Insert both fonts into the font manager
fm.fontManager.ttflist.insert(0, fe_regular)
fm.fontManager.ttflist.insert(1, fe_semibold)
fm.fontManager.ttflist.insert(2, fe_medium)
fm.fontManager.ttflist.insert(3, fe_bold)
# Set the font family
matplotlib.rcParams['font.family'] = fe_regular.name # Default to Regular
In [4]:
season = 2425
In [5]:
fb = pd.read_csv("teamsFOTMOB.csv", index_col = 0)
players = pd.read_csv(f"players{season}.csv", index_col = 0)
games = pd.read_csv(f"games{season}.csv", index_col = 0)
actions = pd.read_csv(f"actions{season}.csv", index_col = 0)
In [6]:
games = games[["game_id", "game_date", "competition_id", "season_id"]]
games['game_date'] = pd.to_datetime(games['game_date'])
players_info = players[['game_id', 'team_id', 'player_id', 'player_name', 'season_id', 'competition_id']]
In [7]:
actions = spadl.add_names(actions)
In [8]:
df = (
actions
.merge(fb, how="left")
.merge(games, how="left")
.merge(players_info, how="left")
)
In [9]:
df = df.sort_values(by=['game_date', 'period_id', 'time_seconds'], ascending=[True, True, True])
In [10]:
df["beginning_distance"] = np.sqrt(np.square(105-df['start_x_a0']) + np.square(34-df['start_y_a0'])).round(2)
df["end_distance"] = np.sqrt(np.square(105-df['end_x_a0']) + np.square(34-df['end_y_a0'])).round(2)
df["length"] = (df["end_distance"] - df["beginning_distance"]).abs()
df["angle"] = np.arctan2(df["end_y_a0"] - df["start_y_a0"], df["end_x_a0"] - df["start_x_a0"])
df['angle_degrees'] = np.degrees(df['angle']) % 360
df["action_distance"] = np.sqrt((df["end_x"] - df["start_x"])**2 + (df["end_y"] - df["start_y"])**2).round(2)
df['duration'] = df['time_seconds'].shift(-1) - df['time_seconds']
# Or if you prefer the actual displacement:
df["instantaneous_speed_euclid"] = df["action_distance"] / df["duration"]
In [11]:
df['instantaneous_speed_euclid'] = df['instantaneous_speed_euclid'].fillna(0)
In [12]:
df['instantaneous_speed_euclid'] = df['instantaneous_speed_euclid'].replace([float('inf'), float('-inf')], 0)
In [13]:
df['progressive'] = np.where(
((df['beginning_distance'] - df['end_distance']) / df['beginning_distance'] >= 0.175) & (df['length'] > 5) &
(((df['angle_degrees'] >= 0) & (df['angle_degrees'] <= 60)) | ((df['angle_degrees'] >= 260) & (df['angle_degrees'] <= 360))) &
~((df['start_x_a0'] >= 88.5) & (df['start_y_a0'] >= 13.885) & (df['start_y_a0'] <= 54.115)),
True, False)
In [14]:
df['is_inbox'] = np.where(
(((df['start_x_a0'] >= 88.5) & (df['start_y_a0'] >= 13.885) & (df['start_y_a0'] <= 54.115)) |
((df['type_name'].isin(['cross', 'pass', 'dribble'])) & (df['end_x_a0'] >= 88.5) & (df['end_y_a0'] >= 13.885) & (df['end_y_a0'] <= 54.115))),
True, False)
In [15]:
df['is_buildup'] = np.where(
((df['start_x_a0'] <= 63) & ~df['type_name'].isin(['foul', 'tackle', 'interception', 'clearance'])) |
((df['start_x_a0'] >= 63) & df['type_name'].isin(['foul', 'tackle', 'interception', 'clearance'])),
True, False
)
In [16]:
df['is_consolidate'] = ~df['is_buildup'] & ~df['is_inbox']
In [17]:
df['is_longball'] = np.where(
(df['type_name'].isin(['pass', 'goalkick'])) & (df['length'] > 40),
True, False)
In [18]:
# Shifting the columns
df["next_team_id"] = df["team_id"].shift(-1, fill_value=0)
df["next_x2_team_id"] = df["team_id"].shift(-2, fill_value=0)
df["next_type_name"] = df["type_name"].shift(-1, fill_value="")
In [19]:
def isolateChains(df):
if df.empty:
return df
# Initialize tracking variables
chain_team = df.iloc[0]["team_id"]
period = df.iloc[0]["period_id"]
stop_criterion = 0
chain = 0
# Ensure columns exist
df["possession_chain"] = 0
df["possession_chain_team"] = 0
for i, row in df.iterrows():
df.loc[i, "possession_chain"] = chain
df.loc[i, "possession_chain_team"] = chain_team
# Criteria for stopping possession chain
if row["next_team_id"] != chain_team and row["next_x2_team_id"] != chain_team:
stop_criterion += 2
if row["type_name"] == "foul":
stop_criterion += 2
if row["next_type_name"] in ['freekick_short', 'throw_in', 'freekick_crossed', 'corner_crossed',
'goalkick', 'shot_freekick', 'corner_short', 'shot_penalty']:
stop_criterion += 2
if row["type_name"] == "shot" and row["result_name"] == "success":
stop_criterion += 2
# New period -> Reset chain
if row["period_id"] != period:
chain += 1
stop_criterion = 0
chain_team = row["team_id"]
period = row["period_id"]
df.loc[i, "possession_chain"] = chain
df.loc[i, "possession_chain_team"] = chain_team
# If stop criterion met, start a new chain
if stop_criterion >= 2:
chain += 1
stop_criterion = 0
chain_team = row["next_team_id"]
return df
# Applying function to DataFrame
df = isolateChains(df)
In [20]:
def calculate_chain_durations(df):
# Get start and end times for each possession chain
chain_durations = df.groupby("possession_chain")["time_seconds"].agg(["min", "max"])
chain_durations["possession_chain_duration"] = chain_durations["max"] - chain_durations["min"]
# Merge duration back into the original DataFrame
df = df.merge(chain_durations[["possession_chain_duration"]], left_on="possession_chain", right_index=True)
return df
# Apply the function
df1 = calculate_chain_durations(df)
In [21]:
def mark_regains(df):
# Define non-regain start conditions
non_regain_events = [
'freekick_short', 'throw_in', 'freekick_crossed', 'corner_crossed',
'goalkick', 'shot_freekick', 'corner_short', 'shot_penalty'
]
# Ensure possession_chain has no NaN values (otherwise groupby might fail)
df = df.copy()
df["possession_chain"] = df["possession_chain"].fillna(-1).astype(int) # Convert to int for safety
# Identify the first event of each possession chain
first_events = df.groupby("possession_chain").first().reset_index()
# Fill NaN values to avoid errors in conditions
first_events["type_name"] = first_events["type_name"].fillna('')
first_events["result_name"] = first_events["result_name"].fillna('')
# Default condition: check if the first event qualifies as a regain
first_events["is_regain"] = ~(
first_events["type_name"].isin(non_regain_events) |
((first_events["type_name"] == "shot") & (first_events["result_name"] == "success"))
)
# Special case: If time_seconds is 0.0, set is_regain to False
first_events.loc[first_events["time_seconds"] == 0.0, "is_regain"] = False
# Merge result back into original dataframe
df = df.merge(first_events[["possession_chain", "is_regain"]], on="possession_chain", how="left")
return df
# Apply the function
df2 = mark_regains(df1)
In [22]:
# Make a copy of df2 to avoid modifying the original DataFrame
df3 = df2.copy()
# Step 1: Track previous actions and their attributes
df3['prev_progressive'] = df3['progressive'].shift(1, fill_value=False)
df3['prev_team'] = df3['team_id'].shift(1)
df3['prev_speed'] = df3['instantaneous_speed_euclid'].shift(1)
df3['prev2_progressive'] = df3['progressive'].shift(2, fill_value=False)
df3['prev2_team'] = df3['team_id'].shift(2)
df3['prev2_speed'] = df3['instantaneous_speed_euclid'].shift(2)
df3['next_type'] = df3['type_name'].shift(-1)
df3['next_progressive'] = df3['progressive'].shift(-1, fill_value=False)
df3['next_team'] = df3['team_id'].shift(-1)
df3['next_speed'] = df3['instantaneous_speed_euclid'].shift(-1)
df3['next_is_inbox'] = df3['is_inbox'].shift(-1)
df3['next2_type'] = df3['type_name'].shift(-2)
df3['next2_progressive'] = df3['progressive'].shift(-2, fill_value=False)
df3['next2_team'] = df3['team_id'].shift(-2)
df3['next2_speed'] = df3['instantaneous_speed_euclid'].shift(-2)
df3['next2_is_inbox'] = df3['is_inbox'].shift(-2)
# Step 2: Identify sequences where either the current action,
# the previous, or the one before that is progressive,
# with same team and within the specified time windows
consecutive_progressive = df3[
(df3['progressive']) &
(
((df3['prev_progressive']) & (df3['team_id'] == df3['prev_team']) & (df3['prev_speed'] >= 10)) |
((df3['prev2_progressive']) & (df3['team_id'] == df3['prev2_team']) & (df3['prev2_speed'] >= 10))
) | (
(df3['progressive']) & (
(((df3['next_is_inbox'] == True) | (df3['next_progressive'] == True) | (df3['next_type'].isin(['shot', 'cross', 'take_on', 'bad_touch'])))
& (df3['team_id'] == df3['next_team']) & (df3['next_speed'] >= 10)) |
(((df3['next2_is_inbox'] == True) | (df3['next2_progressive'] == True) | (df3['next2_type'].isin(['shot', 'cross', 'take_on', 'bad_touch'])))
& (df3['team_id'] == df3['next2_team']) & (df3['next2_speed'] >= 10))
)
)
]
# Step 4: Create a new column 'is_transition' in the DataFrame, initially set to False
df3['is_transition'] = False
# Step 5: Mark rows that are valid end actions as transitions
df3.loc[df3.index.isin(consecutive_progressive.index), 'is_transition'] = True
# Clean up temporary columns used for checking the next row
df3.drop(columns=['prev_progressive', 'prev_team', 'prev_speed',
'prev2_progressive', 'prev2_team', 'prev2_speed',
'next_type', 'next_progressive', 'next_team', 'next_speed', 'next_is_inbox',
'next2_type', 'next2_progressive', 'next2_team', 'next2_speed', 'next2_is_inbox'], inplace=True)
In [23]:
#investigate a chain
df3.loc[df3["possession_chain"] == 8043][['team_name', 'instantaneous_speed_euclid', 'possession_chain', "type_name", "progressive",
"is_longball", "is_regain", "is_buildup", "is_consolidate", "is_inbox", "is_transition"]]
Out[23]:
| team_name | instantaneous_speed_euclid | possession_chain | type_name | progressive | is_longball | is_regain | is_buildup | is_consolidate | is_inbox | is_transition | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 26332 | Salt Lake | 4.341818 | 8043 | cross | False | False | True | False | True | False | False |
| 26333 | Salt Lake | 2.916667 | 8043 | pass | False | False | True | False | True | False | False |
| 26334 | Salt Lake | 18.640000 | 8043 | pass | False | False | True | False | False | True | False |
| 26335 | Salt Lake | 10.780000 | 8043 | dribble | False | False | True | False | False | True | False |
| 26336 | Salt Lake | 8.960000 | 8043 | pass | False | False | True | False | False | True | False |
| 26337 | Salt Lake | 1.193333 | 8043 | pass | False | False | True | False | False | True | False |
| 26338 | Los Angeles FC | 1.461429 | 8043 | clearance | True | False | True | False | True | False | False |
In [24]:
# Filter the possession chain that ended with a shot (assuming 'df' is your DataFrame)
chain = df.loc[df["possession_chain"] == 8043]
# Get passes, dribbles, and crosses
passes = chain.loc[chain["type_name"].isin(["pass"])]
crosses = chain.loc[chain["type_name"].isin(["cross"])]
dribbles = chain.loc[chain["type_name"].isin(["dribble"])]
# Get events other than pass (excluding the last one, which could be the shot)
not_pass = chain.loc[~chain["type_name"].isin(["pass", "cross", "dribble"])]
# Set up the football pitch
pitch = Pitch(line_color='black', pitch_type='custom', pitch_length=105, pitch_width=68, line_zorder=2)
fig, ax = pitch.grid(grid_height=0.9, title_height=0.06, axis=False,
endnote_height=0.04, title_space=0, endnote_space=0)
# Plot passes (using arrows to represent passes)
pitch.arrows(passes.start_x_a0, passes.start_y_a0,
passes.end_x_a0, passes.end_y_a0, color="blue", ax=ax['pitch'], zorder=3)
# Plot passes (using arrows to represent passes)
pitch.arrows(crosses.start_x_a0, crosses.start_y_a0,
crosses.end_x_a0, crosses.end_y_a0, color="green", ax=ax['pitch'], zorder=3)
# Plot passes (using arrows to represent passes)
pitch.arrows(dribbles.start_x_a0, dribbles.start_y_a0,
dribbles.end_x_a0, dribbles.end_y_a0, color="gray", ax=ax['pitch'], zorder=3)
# Plot non-pass events (lines for other events)
pitch.lines(not_pass.start_x_a0, not_pass.start_y_a0, not_pass.end_x_a0, not_pass.end_y_a0, color="black", lw=1.5,
ax=ax['pitch'])
# Scatter plot for non-passing events like shots (if any)
pitch.scatter(not_pass.start_x_a0, not_pass.start_y_a0, color="black", s=100, ax=ax['pitch'])
# Display the plot
plt.show()
In [25]:
df3.loc[df3["possession_chain"] == 20942][['team_name', 'instantaneous_speed_euclid', 'possession_chain', "type_name", "progressive",
"is_longball", "is_regain", "is_buildup", "is_consolidate", "is_inbox", "is_transition"]]
Out[25]:
| team_name | instantaneous_speed_euclid | possession_chain | type_name | progressive | is_longball | is_regain | is_buildup | is_consolidate | is_inbox | is_transition | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 64264 | Nashville SC | 11.375 | 20942 | pass | True | False | True | True | False | False | True |
| 64265 | Nashville SC | 15.060 | 20942 | pass | True | False | True | True | False | False | True |
In [26]:
df4 = df3.filter(items=['game_id', 'period_id', 'time_seconds', 'team_id', 'player_id', 'type_id', 'start_x_a0', 'start_y_a0', 'end_x_a0',
'end_y_a0', 'instantaneous_speed_euclid', 'is_inbox', 'is_buildup', 'is_consolidate', 'is_longball',
'possession_chain', 'possession_chain_team', 'possession_chain_duration', 'is_regain', 'is_transition'])
df4
Out[26]:
| game_id | period_id | time_seconds | team_id | player_id | type_id | start_x_a0 | start_y_a0 | end_x_a0 | end_y_a0 | instantaneous_speed_euclid | is_inbox | is_buildup | is_consolidate | is_longball | possession_chain | possession_chain_team | possession_chain_duration | is_regain | is_transition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1790750 | 1 | 0.0 | 28925 | 22221.0 | 0 | 52.710 | 33.592 | 47.355 | 32.028 | 2.790000 | False | True | False | False | 0 | 28925 | 48.0 | False | False |
| 1 | 1790750 | 1 | 2.0 | 28925 | 334566.0 | 0 | 49.980 | 32.708 | 64.365 | 44.948 | 18.890000 | False | True | False | False | 0 | 28925 | 48.0 | False | False |
| 2 | 1790750 | 1 | 3.0 | 28925 | 130254.0 | 0 | 64.260 | 44.948 | 69.300 | 40.324 | 6.840000 | False | False | True | False | 0 | 28925 | 48.0 | False | False |
| 3 | 1790750 | 1 | 4.0 | 28925 | 22221.0 | 0 | 69.300 | 40.324 | 57.225 | 18.360 | 8.353333 | False | False | True | False | 0 | 28925 | 48.0 | False | False |
| 4 | 1790750 | 1 | 7.0 | 2947 | 439649.0 | 7 | 45.990 | 48.824 | 45.990 | 48.824 | 0.000000 | False | True | False | False | 0 | 28925 | 48.0 | False | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7862116 | 1836683 | 2 | 3113.0 | 108 | 349491.0 | 0 | 82.635 | 49.776 | 88.200 | 64.872 | 8.045000 | False | False | True | False | 3948967 | 108 | 25.0 | True | False |
| 7862117 | 1836683 | 2 | 3115.0 | 108 | 410174.0 | 0 | 88.200 | 64.872 | 69.615 | 57.460 | 6.670000 | False | False | True | False | 3948967 | 108 | 25.0 | True | False |
| 7862118 | 1836683 | 2 | 3118.0 | 108 | 421985.0 | 0 | 69.615 | 57.460 | 70.350 | 34.884 | 15.060000 | False | False | True | False | 3948967 | 108 | 25.0 | True | False |
| 7862119 | 1836683 | 2 | 3119.5 | 108 | 369715.0 | 21 | 70.350 | 34.884 | 69.510 | 28.220 | 4.480000 | False | False | True | False | 3948967 | 108 | 25.0 | True | False |
| 7862120 | 1836683 | 2 | 3121.0 | 108 | 369715.0 | 0 | 69.510 | 28.220 | 84.945 | 21.488 | 0.000000 | False | False | True | False | 3948967 | 108 | 25.0 | True | True |
7862121 rows × 20 columns
In [27]:
df4.to_csv(f"possession_chains_info{season}.csv")
In [ ]: