In [1]:
import matplotlib
import pandas as pd
import numpy as np
import warnings
import urllib
from PIL import Image
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.patches import RegularPolygon
from mplsoccer import Pitch, VerticalPitch, lines
from scipy.ndimage import gaussian_filter
In [2]:
import socceraction
import socceraction.spadl as spadl
In [3]:
fe_regular = fm.FontEntry(
    fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-Regular.ttf',
    name='SourceSansPro-Regular'
)
fe_semibold = fm.FontEntry(
    fname='/Users/davidegualano/Documents/Python FTBLData/SourceSansPro-SemiBold.ttf',
    name='SourceSansPro-SemiBold'
)

fe_medium = fm.FontEntry(
    fname='/Users/davidegualano/Documents/Python FTBLData/Shentox-W01-Medium.ttf',
    name='Shentox-Medium'
)
fe_bold = fm.FontEntry(
    fname='/Users/davidegualano/Documents/Python FTBLData/Shentox-W01-Bold.ttf',
    name='Shentox-Bold'
)

# Insert both fonts into the font manager
fm.fontManager.ttflist.insert(0, fe_regular)
fm.fontManager.ttflist.insert(1, fe_semibold)
fm.fontManager.ttflist.insert(2, fe_medium)
fm.fontManager.ttflist.insert(3, fe_bold)

# Set the font family
matplotlib.rcParams['font.family'] = fe_regular.name  # Default to Regular
In [4]:
season = 2425
In [5]:
fb = pd.read_csv("teamsFOTMOB.csv", index_col = 0)
players = pd.read_csv(f"players{season}.csv", index_col = 0)
games = pd.read_csv(f"games{season}.csv", index_col = 0)
actions = pd.read_csv(f"actions{season}.csv", index_col = 0)
In [6]:
games = games[["game_id", "game_date", "competition_id", "season_id"]]
games['game_date'] = pd.to_datetime(games['game_date'])
players_info = players[['game_id', 'team_id', 'player_id', 'player_name', 'season_id', 'competition_id']]
In [7]:
actions = spadl.add_names(actions)
In [8]:
df = (
    actions
    .merge(fb, how="left")
    .merge(games, how="left")
    .merge(players_info, how="left")
)
In [9]:
df = df.sort_values(by=['game_date', 'period_id', 'time_seconds'], ascending=[True, True, True])
In [10]:
df["beginning_distance"] = np.sqrt(np.square(105-df['start_x_a0']) + np.square(34-df['start_y_a0'])).round(2)
df["end_distance"] = np.sqrt(np.square(105-df['end_x_a0']) + np.square(34-df['end_y_a0'])).round(2)
df["length"] = (df["end_distance"] - df["beginning_distance"]).abs()
df["angle"] = np.arctan2(df["end_y_a0"] - df["start_y_a0"], df["end_x_a0"] - df["start_x_a0"])
df['angle_degrees'] = np.degrees(df['angle']) % 360

df["action_distance"] = np.sqrt((df["end_x"] - df["start_x"])**2 + (df["end_y"] - df["start_y"])**2).round(2)
df['duration'] = df['time_seconds'].shift(-1) - df['time_seconds']
# Or if you prefer the actual displacement:
df["instantaneous_speed_euclid"] = df["action_distance"] / df["duration"]
In [11]:
df['instantaneous_speed_euclid'] = df['instantaneous_speed_euclid'].fillna(0)
In [12]:
df['instantaneous_speed_euclid'] = df['instantaneous_speed_euclid'].replace([float('inf'), float('-inf')], 0)
In [13]:
df['progressive'] = np.where(
    ((df['beginning_distance'] - df['end_distance']) / df['beginning_distance'] >= 0.175) & (df['length'] > 5) & 
    (((df['angle_degrees'] >= 0) & (df['angle_degrees'] <= 60)) | ((df['angle_degrees'] >= 260) & (df['angle_degrees'] <= 360))) &
    ~((df['start_x_a0'] >= 88.5) & (df['start_y_a0'] >= 13.885) & (df['start_y_a0'] <= 54.115)),
    True, False)
In [14]:
df['is_inbox'] = np.where(
    (((df['start_x_a0'] >= 88.5) & (df['start_y_a0'] >= 13.885) & (df['start_y_a0'] <= 54.115)) |
     ((df['type_name'].isin(['cross', 'pass', 'dribble'])) & (df['end_x_a0'] >= 88.5) & (df['end_y_a0'] >= 13.885) & (df['end_y_a0'] <= 54.115))),
    True, False)
In [15]:
df['is_buildup'] = np.where(
    ((df['start_x_a0'] <= 63) & ~df['type_name'].isin(['foul', 'tackle', 'interception', 'clearance'])) |
    ((df['start_x_a0'] >= 63) & df['type_name'].isin(['foul', 'tackle', 'interception', 'clearance'])),
    True, False
)
In [16]:
df['is_consolidate'] = ~df['is_buildup'] & ~df['is_inbox']
In [17]:
df['is_longball'] = np.where(
    (df['type_name'].isin(['pass', 'goalkick'])) & (df['length'] > 40),
    True, False)
In [18]:
# Shifting the columns
df["next_team_id"] = df["team_id"].shift(-1, fill_value=0)
df["next_x2_team_id"] = df["team_id"].shift(-2, fill_value=0)
df["next_type_name"] = df["type_name"].shift(-1, fill_value="")
In [19]:
def isolateChains(df):
    if df.empty:
        return df
    
    # Initialize tracking variables
    chain_team = df.iloc[0]["team_id"]
    period = df.iloc[0]["period_id"]
    stop_criterion = 0
    chain = 0
    
    # Ensure columns exist
    df["possession_chain"] = 0
    df["possession_chain_team"] = 0

    for i, row in df.iterrows():
        df.loc[i, "possession_chain"] = chain
        df.loc[i, "possession_chain_team"] = chain_team

        # Criteria for stopping possession chain
        if row["next_team_id"] != chain_team and row["next_x2_team_id"] != chain_team:
            stop_criterion += 2
        if row["type_name"] == "foul":
            stop_criterion += 2
        if row["next_type_name"] in ['freekick_short', 'throw_in', 'freekick_crossed', 'corner_crossed',
                                     'goalkick', 'shot_freekick', 'corner_short', 'shot_penalty']:
            stop_criterion += 2
        if row["type_name"] == "shot" and row["result_name"] == "success":
            stop_criterion += 2
        
        # New period -> Reset chain
        if row["period_id"] != period:
            chain += 1
            stop_criterion = 0
            chain_team = row["team_id"]
            period = row["period_id"]
            df.loc[i, "possession_chain"] = chain
            df.loc[i, "possession_chain_team"] = chain_team
        
        # If stop criterion met, start a new chain
        if stop_criterion >= 2:
            chain += 1
            stop_criterion = 0
            chain_team = row["next_team_id"]
    
    return df

# Applying function to DataFrame
df = isolateChains(df)
In [20]:
def calculate_chain_durations(df):
    # Get start and end times for each possession chain
    chain_durations = df.groupby("possession_chain")["time_seconds"].agg(["min", "max"])
    chain_durations["possession_chain_duration"] = chain_durations["max"] - chain_durations["min"]
    
    # Merge duration back into the original DataFrame
    df = df.merge(chain_durations[["possession_chain_duration"]], left_on="possession_chain", right_index=True)
    
    return df

# Apply the function
df1 = calculate_chain_durations(df)
In [21]:
def mark_regains(df):
    # Define non-regain start conditions
    non_regain_events = [
        'freekick_short', 'throw_in', 'freekick_crossed', 'corner_crossed',
        'goalkick', 'shot_freekick', 'corner_short', 'shot_penalty'
    ]

    # Ensure possession_chain has no NaN values (otherwise groupby might fail)
    df = df.copy()
    df["possession_chain"] = df["possession_chain"].fillna(-1).astype(int)  # Convert to int for safety

    # Identify the first event of each possession chain
    first_events = df.groupby("possession_chain").first().reset_index()

    # Fill NaN values to avoid errors in conditions
    first_events["type_name"] = first_events["type_name"].fillna('')
    first_events["result_name"] = first_events["result_name"].fillna('')

    # Default condition: check if the first event qualifies as a regain
    first_events["is_regain"] = ~(
        first_events["type_name"].isin(non_regain_events) |
        ((first_events["type_name"] == "shot") & (first_events["result_name"] == "success"))
    )

    # Special case: If time_seconds is 0.0, set is_regain to False
    first_events.loc[first_events["time_seconds"] == 0.0, "is_regain"] = False

    # Merge result back into original dataframe
    df = df.merge(first_events[["possession_chain", "is_regain"]], on="possession_chain", how="left")

    return df

# Apply the function
df2 = mark_regains(df1)
In [22]:
# Make a copy of df2 to avoid modifying the original DataFrame
df3 = df2.copy()

# Step 1: Track previous actions and their attributes
df3['prev_progressive'] = df3['progressive'].shift(1, fill_value=False)
df3['prev_team'] = df3['team_id'].shift(1)
df3['prev_speed'] = df3['instantaneous_speed_euclid'].shift(1)

df3['prev2_progressive'] = df3['progressive'].shift(2, fill_value=False)
df3['prev2_team'] = df3['team_id'].shift(2)
df3['prev2_speed'] = df3['instantaneous_speed_euclid'].shift(2)

df3['next_type'] = df3['type_name'].shift(-1)
df3['next_progressive'] = df3['progressive'].shift(-1, fill_value=False)
df3['next_team'] = df3['team_id'].shift(-1)
df3['next_speed'] = df3['instantaneous_speed_euclid'].shift(-1)
df3['next_is_inbox'] = df3['is_inbox'].shift(-1)

df3['next2_type'] = df3['type_name'].shift(-2)
df3['next2_progressive'] = df3['progressive'].shift(-2, fill_value=False)
df3['next2_team'] = df3['team_id'].shift(-2)
df3['next2_speed'] = df3['instantaneous_speed_euclid'].shift(-2)
df3['next2_is_inbox'] = df3['is_inbox'].shift(-2)

# Step 2: Identify sequences where either the current action,
# the previous, or the one before that is progressive,
# with same team and within the specified time windows
consecutive_progressive = df3[
    (df3['progressive']) & 
    (
        ((df3['prev_progressive']) & (df3['team_id'] == df3['prev_team']) & (df3['prev_speed'] >= 10)) | 
        ((df3['prev2_progressive']) & (df3['team_id'] == df3['prev2_team']) & (df3['prev2_speed'] >= 10))
    ) | (
        (df3['progressive']) & (
            (((df3['next_is_inbox'] == True) | (df3['next_progressive'] == True) | (df3['next_type'].isin(['shot', 'cross', 'take_on', 'bad_touch']))) 
            & (df3['team_id'] == df3['next_team']) & (df3['next_speed'] >= 10)) | 
            (((df3['next2_is_inbox'] == True) | (df3['next2_progressive'] == True) | (df3['next2_type'].isin(['shot', 'cross', 'take_on', 'bad_touch']))) 
            & (df3['team_id'] == df3['next2_team']) & (df3['next2_speed'] >= 10))
        )
    )
]


# Step 4: Create a new column 'is_transition' in the DataFrame, initially set to False
df3['is_transition'] = False

# Step 5: Mark rows that are valid end actions as transitions
df3.loc[df3.index.isin(consecutive_progressive.index), 'is_transition'] = True

# Clean up temporary columns used for checking the next row
df3.drop(columns=['prev_progressive', 'prev_team', 'prev_speed',
                  'prev2_progressive', 'prev2_team', 'prev2_speed',
                  'next_type', 'next_progressive', 'next_team', 'next_speed', 'next_is_inbox',
                  'next2_type', 'next2_progressive', 'next2_team', 'next2_speed', 'next2_is_inbox'], inplace=True)
In [23]:
#investigate a chain
df3.loc[df3["possession_chain"] == 8043][['team_name', 'instantaneous_speed_euclid', 'possession_chain', "type_name", "progressive",
                                        "is_longball", "is_regain", "is_buildup", "is_consolidate", "is_inbox", "is_transition"]]
Out[23]:
team_name instantaneous_speed_euclid possession_chain type_name progressive is_longball is_regain is_buildup is_consolidate is_inbox is_transition
26332 Salt Lake 4.341818 8043 cross False False True False True False False
26333 Salt Lake 2.916667 8043 pass False False True False True False False
26334 Salt Lake 18.640000 8043 pass False False True False False True False
26335 Salt Lake 10.780000 8043 dribble False False True False False True False
26336 Salt Lake 8.960000 8043 pass False False True False False True False
26337 Salt Lake 1.193333 8043 pass False False True False False True False
26338 Los Angeles FC 1.461429 8043 clearance True False True False True False False
In [24]:
# Filter the possession chain that ended with a shot (assuming 'df' is your DataFrame)
chain = df.loc[df["possession_chain"] == 8043]

# Get passes, dribbles, and crosses
passes = chain.loc[chain["type_name"].isin(["pass"])]
crosses = chain.loc[chain["type_name"].isin(["cross"])]
dribbles = chain.loc[chain["type_name"].isin(["dribble"])]

# Get events other than pass (excluding the last one, which could be the shot)
not_pass = chain.loc[~chain["type_name"].isin(["pass", "cross", "dribble"])]

# Set up the football pitch
pitch = Pitch(line_color='black', pitch_type='custom', pitch_length=105, pitch_width=68, line_zorder=2)
fig, ax = pitch.grid(grid_height=0.9, title_height=0.06, axis=False,
                     endnote_height=0.04, title_space=0, endnote_space=0)

# Plot passes (using arrows to represent passes)
pitch.arrows(passes.start_x_a0, passes.start_y_a0,
             passes.end_x_a0, passes.end_y_a0, color="blue", ax=ax['pitch'], zorder=3)
# Plot passes (using arrows to represent passes)
pitch.arrows(crosses.start_x_a0, crosses.start_y_a0,
             crosses.end_x_a0, crosses.end_y_a0, color="green", ax=ax['pitch'], zorder=3)
# Plot passes (using arrows to represent passes)
pitch.arrows(dribbles.start_x_a0, dribbles.start_y_a0,
             dribbles.end_x_a0, dribbles.end_y_a0, color="gray", ax=ax['pitch'], zorder=3)

# Plot non-pass events (lines for other events)
pitch.lines(not_pass.start_x_a0, not_pass.start_y_a0, not_pass.end_x_a0, not_pass.end_y_a0, color="black", lw=1.5,
            ax=ax['pitch'])

# Scatter plot for non-passing events like shots (if any)
pitch.scatter(not_pass.start_x_a0, not_pass.start_y_a0, color="black", s=100, ax=ax['pitch'])

# Display the plot
plt.show()
No description has been provided for this image
In [25]:
df3.loc[df3["possession_chain"] == 20942][['team_name', 'instantaneous_speed_euclid', 'possession_chain', "type_name", "progressive",
                                        "is_longball", "is_regain", "is_buildup", "is_consolidate", "is_inbox", "is_transition"]]
Out[25]:
team_name instantaneous_speed_euclid possession_chain type_name progressive is_longball is_regain is_buildup is_consolidate is_inbox is_transition
64264 Nashville SC 11.375 20942 pass True False True True False False True
64265 Nashville SC 15.060 20942 pass True False True True False False True
In [26]:
df4 = df3.filter(items=['game_id', 'period_id', 'time_seconds', 'team_id', 'player_id', 'type_id', 'start_x_a0', 'start_y_a0', 'end_x_a0',
                       'end_y_a0', 'instantaneous_speed_euclid', 'is_inbox', 'is_buildup', 'is_consolidate', 'is_longball',
                       'possession_chain', 'possession_chain_team', 'possession_chain_duration', 'is_regain', 'is_transition'])
df4
Out[26]:
game_id period_id time_seconds team_id player_id type_id start_x_a0 start_y_a0 end_x_a0 end_y_a0 instantaneous_speed_euclid is_inbox is_buildup is_consolidate is_longball possession_chain possession_chain_team possession_chain_duration is_regain is_transition
0 1790750 1 0.0 28925 22221.0 0 52.710 33.592 47.355 32.028 2.790000 False True False False 0 28925 48.0 False False
1 1790750 1 2.0 28925 334566.0 0 49.980 32.708 64.365 44.948 18.890000 False True False False 0 28925 48.0 False False
2 1790750 1 3.0 28925 130254.0 0 64.260 44.948 69.300 40.324 6.840000 False False True False 0 28925 48.0 False False
3 1790750 1 4.0 28925 22221.0 0 69.300 40.324 57.225 18.360 8.353333 False False True False 0 28925 48.0 False False
4 1790750 1 7.0 2947 439649.0 7 45.990 48.824 45.990 48.824 0.000000 False True False False 0 28925 48.0 False False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
7862116 1836683 2 3113.0 108 349491.0 0 82.635 49.776 88.200 64.872 8.045000 False False True False 3948967 108 25.0 True False
7862117 1836683 2 3115.0 108 410174.0 0 88.200 64.872 69.615 57.460 6.670000 False False True False 3948967 108 25.0 True False
7862118 1836683 2 3118.0 108 421985.0 0 69.615 57.460 70.350 34.884 15.060000 False False True False 3948967 108 25.0 True False
7862119 1836683 2 3119.5 108 369715.0 21 70.350 34.884 69.510 28.220 4.480000 False False True False 3948967 108 25.0 True False
7862120 1836683 2 3121.0 108 369715.0 0 69.510 28.220 84.945 21.488 0.000000 False False True False 3948967 108 25.0 True True

7862121 rows × 20 columns

In [27]:
df4.to_csv(f"possession_chains_info{season}.csv")
In [ ]: