cricket-insights-v03
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
# 1. SETUP AND CLEANING
# ---------------------------------------------------------
# Plot defaults: seaborn's white-grid theme first, then a 12x8 inch default
# canvas (the theme call must come first, otherwise it would reset the size).
sns.set_theme(style="whitegrid")
plt.rc("figure", figsize=(12, 8))

# Raw batting statistics; the CSV must be named exactly 'batting_stats.csv'
# and live in the working directory.
df = pd.read_csv('batting_stats.csv')
# Helper to clean the "count(percentage%)" strings
def extract_pct(val):
    """Pull the percentage out of a "count(percentage%)" string.

    Args:
        val: A cell value. Strings are searched for a parenthesised
            percentage such as "(45.5%)"; any other type is passed through.

    Returns:
        The percentage as a float when the string contains one, 0.0 when the
        string has no parsable percentage, and ``val`` unchanged when it is
        not a string.
    """
    if not isinstance(val, str):
        return val
    match = re.search(r'\(([\d\.]+)\%\)', val)
    if match is None:
        return 0.0
    try:
        return float(match.group(1))
    except ValueError:
        # The character class also admits malformed runs such as "1.2.3" or
        # "."; treat them like any other unparsable string instead of
        # aborting the whole column conversion.
        return 0.0
# Clean percentage columns: every "count(percentage%)" column gets a numeric
# companion column carrying a '_VAL' suffix.
for pct_col in ('DID NOT BAT(%)', 'GAMES WON(%)', 'GAMES LOST(%)', 'GAMES DRAWN(%)'):
    df[f'{pct_col}_VAL'] = df[pct_col].map(extract_pct)

# Minimum 5 innings for meaningful averages
q_df = df.loc[df['INNS'] >= 5].copy()

# 2. ADDITIONAL METRICS
# ---------------------------------------------------------
# Scoring speed vs Reliability: the squad medians are the role thresholds
avg_mid, sr_mid = q_df['AVG'].median(), q_df['STRIKE RATE'].median()
def get_role(row):
    """Classify a batter relative to the squad medians.

    Reads the module-level thresholds ``avg_mid`` and ``sr_mid``.

    Args:
        row: Mapping (e.g. a DataFrame row) with 'AVG' and 'STRIKE RATE'.

    Returns:
        'Star' (both at or above the median), 'Anchor' (average only),
        'Aggressor' (strike rate only) or 'Struggler' (neither).
    """
    reliable = row['AVG'] >= avg_mid
    quick = row['STRIKE RATE'] >= sr_mid
    if reliable and quick:
        return 'Star'
    if reliable:
        return 'Anchor'
    if quick:
        return 'Aggressor'
    # Must be exactly 'Struggler': the Chart 4 role table and box plot look
    # roles up through role_order, which uses this label. The previous
    # 'Lower Order/Struggler' label left that box and table column empty.
    return 'Struggler'
# Tag every qualifying batter with a role
q_df['Role'] = q_df.apply(get_role, axis=1)

# Boundary reliance: runs scored in fours and sixes, then their share of all runs
q_df['Boundaries'] = q_df['4s'].mul(4).add(q_df['6s'].mul(6))
q_df['Boundary_%'] = q_df['Boundaries'].div(q_df['RUNS']).mul(100)
# 3. NEW VISUALIZATIONS
# ---------------------------------------------------------
# --- Chart 1: Top 10 Scorers (Fixed Warning) ---
import matplotlib.pyplot as plt
import seaborn as sns

# Discard figures left over from earlier cells, then open a fresh canvas
plt.close('all')
plt.figure(figsize=(10, 6))

# Ten highest run totals, taken from the full squad (no innings filter)
top_10 = df.sort_values(by='RUNS', ascending=False).iloc[:10]

# 'Player' doubles as the hue (legend switched off) to satisfy newer seaborn
# requirements for using a palette
bar_ax = sns.barplot(
    data=top_10,
    x='RUNS',
    y='Player',
    hue='Player',
    palette='viridis',
    legend=False,
)
bar_ax.set_title('Season Heavyweights: Top 10 Run Scorers', fontsize=14, pad=15)
bar_ax.set_xlabel('Total Runs Scored')
bar_ax.set_ylabel('Player Name')
bar_ax.grid(axis='x', linestyle='--', alpha=0.7)
plt.show()
This scatter plot is essentially a "Squad DNA Map." It breaks the team down into four strategic zones based on the team's median performance.
Here is the professional inference you can add to your report:
Inference: Strategic Player DNA Analysis. This chart visualizes the balance between Reliability (Batting Average) and Aggression (Strike Rate). The grey dashed lines represent the squad's median performance, dividing the players into four distinct quadrants:
- The Four Performance Zones
Top-Right (Stars): These are your "Peak Performers." They score faster than the squad median and, with an above-median average, stay at the crease longer. Players like Cameron Heard and Nicky Kirkwood sit here, representing your most dangerous assets.
Top-Left (Anchors): High average but lower strike rate. These players, such as Hassan Azad, are the "glue" of the innings. They provide stability and allow the more aggressive players to take risks at the other end.
Bottom-Right (Aggressors): These players score quickly but have lower averages. They are high-impact "finishers" or pinch-hitters who can change the momentum of a game in just a few overs.
Bottom-Left (Developing/Strugglers): Players in this zone are currently performing below the team's median in both categories. This identifies a need for either more consistent scoring or an increase in intent.
- Bubble Size (Total Volume of Runs)
The size of the dot represents the total runs scored. A large dot (like Hassan Azad) high up in the "Anchor" or "Star" zones indicates a player who has carried a significant portion of the team's scoring load this season.
Smaller dots in the "Star" zone represent high-potential players who are performing brilliantly but perhaps haven't had as much time at the crease yet.
- Strategic Coaching Takeaway
Versatility: A healthy squad should have dots scattered across the top three quadrants. If all your dots were in the Top-Left, the team would be too slow; if all were in the Bottom-Right, the team would be too "fragile."
The Gap: Look for empty spaces. For example, if the Top-Right "Star" zone is empty, the team relies too heavily on individuals who either score slowly or get out quickly, without anyone combining speed and reliability.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Start from a clean slate
plt.close('all')

# 1. A large canvas leaves room for one label per player
plt.figure(figsize=(16, 10))

# 2. Scatter plot: colour encodes the role, bubble size the total runs
dna_ax = sns.scatterplot(
    data=q_df,
    x='STRIKE RATE',
    y='AVG',
    hue='Role',
    size='RUNS',
    sizes=(100, 1000),
    alpha=0.6,
)

# 3. Quadrant cross-hairs at the squad medians
dna_ax.axvline(q_df['STRIKE RATE'].median(), ls='--', color='gray', alpha=0.5)
dna_ax.axhline(q_df['AVG'].median(), ls='--', color='gray', alpha=0.5)

# 4. Name EVERY player; the small x/y offset keeps the text off the dot itself
for strike_rate, average, player in zip(q_df['STRIKE RATE'], q_df['AVG'], q_df['Player']):
    dna_ax.text(strike_rate + 0.8, average + 0.3, player,
                fontsize=8, alpha=0.9, verticalalignment='center')

dna_ax.set_title('Complete Squad DNA: Individual Player Analysis', fontsize=16, pad=20)
dna_ax.set_xlabel('Strike Rate (Scoring Speed)')
dna_ax.set_ylabel('Batting Average (Reliability)')

# The legend sits outside the axes so it cannot hide any labels
dna_ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0.)
plt.tight_layout()
plt.show()
# --- Chart 3: Dismissal Analysis (Heatmap) ---
# One row per top scorer, one column per way of getting out
dismissal_types = ['BOWLED', 'CAUGHT', 'LBW', 'STUMPED', 'RUN OUT']
leading_scorers = q_df.sort_values('RUNS', ascending=False).head(10)
top_players_dismissals = leading_scorers.set_index('Player')[dismissal_types]

plt.figure(figsize=(10, 6))
heat_ax = sns.heatmap(top_players_dismissals, annot=True, cmap='YlGnBu', fmt='g')
heat_ax.set_title('Opposition Scouting: How the Top Scorers get out')
plt.show()
# --- Chart 4: Boundary Hitting vs Consistency ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Requires q_df with its 'Role' and 'Boundary_%' columns already computed
# ---------------------------------------------------------
# 1. Prepare the player names for the table
role_order = ['Star', 'Anchor', 'Aggressor', 'Struggler']

# One list of names per role, most boundary-reliant player first
table_columns = [
    q_df.loc[q_df['Role'] == role]
    .sort_values('Boundary_%', ascending=False)['Player']
    .tolist()
    for role in role_order
]
max_players = max((len(names) for names in table_columns), default=0)

# Pad the shorter columns with blanks so that every column has the same length
formatted_table = [
    names + [''] * (max_players - len(names)) for names in table_columns
]

# Transpose: the table widget wants rows, not columns
table_rows = list(zip(*formatted_table))

# 2. Create the Plot
plt.close('all')
fig, ax = plt.subplots(figsize=(14, 12))

# 'hue' mirrors 'x' so a palette can be used; the resulting legend is redundant
sns.boxplot(
    data=q_df,
    x='Role',
    y='Boundary_%',
    order=role_order,
    hue='Role',
    palette='Set2',
    ax=ax,
)
auto_legend = ax.get_legend()
if auto_legend is not None:
    auto_legend.remove()

# 3. Add the Data Table underneath the axes
the_table = ax.table(
    cellText=table_rows,
    colLabels=role_order,
    loc='bottom',
    bbox=[0, -0.7, 1, 0.6],  # Position and size adjustment
)
the_table.auto_set_font_size(False)
the_table.set_fontsize(9)

# Leave room below the axes so the table does not overlap the plot
fig.subplots_adjust(bottom=0.45)
ax.set_title('Boundary Reliance by Player Role (Detailed Breakdown)', fontsize=16, pad=30)
ax.set_ylabel('% of Runs from Boundaries')
ax.set_xlabel('')  # The table headers already name the roles
plt.show()
# 4. FINAL SUMMARY
# ---------------------------------------------------------
print("--- STRATEGIC INFERENCES ---")

# Highest run total in the whole squad
top_scorer = top_10.iloc[0]['Player']
print(f"Top Performer: {top_scorer} is the foundation of the team.")

# Highest strike rate among players with at least 5 innings
fastest_scorer = q_df.sort_values('STRIKE RATE', ascending=False).iloc[0]['Player']
print(f"Danger Man: {fastest_scorer} has the highest Strike Rate.")

# Dismissal type with the largest squad-wide total
most_common_dismissal = df[dismissal_types].sum().idxmax()
print(f"Technical Focus: Most common dismissal is '{most_common_dismissal}'. Check training drills here.")
--- STRATEGIC INFERENCES --- Top Performer: Hassan Azad is the foundation of the team. Danger Man: Joshua Malkani has the highest Strike Rate. Technical Focus: Most common dismissal is 'CAUGHT'. Check training drills here.
## 1. The Quadrant Drill-Down (Facet Grid)
### Instead of one busy chart, this separates players into four distinct windows.
### This is great for a coach to see exactly how many players are in each "bucket" without them overlapping.
#### Inference: It allows you to see the "density" of your squad.
###If the Struggler window is crowded, it shows a need for technical training.
###If the Star window is crowded, the team is in a very strong position.
# CHUNK 1: Facet Grid Analysis
# ---------------------------------------------------------
# One panel per role (two panels per row), each showing only that role's players
g = sns.FacetGrid(q_df, col="Role", hue="Role", col_wrap=2, height=4, aspect=1.2)
g.map(sns.scatterplot, "STRIKE RATE", "AVG", s=100, alpha=0.7)

# Draw the squad-median cross-hairs on every panel. A dedicated loop name is
# used so the module-level `ax` from other cells is not overwritten.
for facet_ax in g.axes.flat:
    facet_ax.axvline(sr_mid, ls='--', color='gray', alpha=0.3)
    facet_ax.axhline(avg_mid, ls='--', color='gray', alpha=0.3)

g.set_axis_labels("Strike Rate (Speed)", "Average (Consistency)")
# `figure` is the supported accessor; `fig` is its deprecated alias
g.figure.suptitle("Squad Segmentation: Identifying Strengths by Role", y=1.05)
plt.show()
##2. The Density & Concentration View (KDE Plot)
##This chart removes the individual names and shows "heat zones."
###It’s a professional way to see where the "average" player on your team sits.
##Inference: The "peaks" (darker areas) show where most of your players are performing.
###If the peak is in the bottom-left, the team's overall batting style is conservative and perhaps underperforming.
##You want to see the heat moving toward the top-right over the course of a season.
# CHUNK 2: Density Analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data and keep players with at least 5 innings
df = pd.read_csv('batting_stats.csv')
q_df = df[df['INNS'] >= 5].copy()

# Median values for quadrants
avg_mid = q_df['AVG'].median()
sr_mid = q_df['STRIKE RATE'].median()

# Create the plot
plt.close('all')
fig, ax = plt.subplots(figsize=(12, 8))

# KDE Plot: filled density contours of the squad in (strike rate, average) space
sns.kdeplot(data=q_df, x='STRIKE RATE', y='AVG', fill=True, thresh=0.05, levels=15, cmap="mako", alpha=0.8, ax=ax)

# Scatter points for players
sns.scatterplot(data=q_df, x='STRIKE RATE', y='AVG', color='white', s=30, alpha=0.6, ax=ax)

# Quadrant lines. The thresholds are medians, so they are labelled as such
# (the old labels called them averages).
ax.axvline(sr_mid, color='red', linestyle='--', alpha=0.5, label=f'Median Strike Rate ({sr_mid:.1f})')
ax.axhline(avg_mid, color='red', linestyle='--', alpha=0.5, label=f'Median Average ({avg_mid:.1f})')
# Without a legend the two labels above were never displayed anywhere
ax.legend(loc='upper left')

# Annotate zones for the user (offsets are in data units from the median point)
ax.text(sr_mid + 5, avg_mid + 5, "THE ELITE ZONE\n(High Average, High Speed)", color='white', weight='bold', fontsize=10)
ax.text(sr_mid - 25, avg_mid - 15, "THE SQUAD HUB\n(Most players concentrate here)", color='white', weight='bold', fontsize=10)

plt.title('Squad Performance Density (The "Heat Map" of Batting)', fontsize=16, pad=20)
plt.xlabel('Strike Rate (How fast they score)')
plt.ylabel('Batting Average (How long they stay in)')

# Table logic - the ten players closest to the median point (sr_mid, avg_mid),
# used as a stand-in for "players in the densest area"
q_df['dist_to_center'] = np.sqrt((q_df['STRIKE RATE'] - sr_mid)**2 + (q_df['AVG'] - avg_mid)**2)
hub_players = q_df.sort_values('dist_to_center').head(10)[['Player', 'AVG', 'STRIKE RATE']]

# Formatting the table for display
table_rows = hub_players.values.tolist()
table_cols = ['Player Name', 'Average', 'Strike Rate']
the_table = plt.table(cellText=table_rows,
                      colLabels=table_cols,
                      loc='right',
                      bbox=[1.05, 0, 0.4, 0.8])

# Shrink the axes so the table to their right stays inside the figure
plt.subplots_adjust(right=0.7)
plt.savefig('kde_explained_with_table.png')
# CHUNK 3: Role-wise Reliability
# ---------------------------------------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns  # Corrected from 'import sns'

# 1. Load the data (the file name must match the saved file exactly)
df = pd.read_csv('batting_stats.csv')

# 2. Regular players only: at least 5 innings
q_df = df.loc[df['INNS'] >= 5].copy()

# 3. Squad medians are the thresholds that define the roles
avg_mid, sr_mid = q_df['AVG'].median(), q_df['STRIKE RATE'].median()


def get_role(row):
    """Return the role label for one player, judged against the squad medians."""
    above_avg = row['AVG'] >= avg_mid
    above_sr = row['STRIKE RATE'] >= sr_mid
    if above_avg:
        return 'Star' if above_sr else 'Anchor'
    return 'Aggressor' if above_sr else 'Struggler'


# Create the 'Role' column
q_df['Role'] = q_df.apply(get_role, axis=1)

# 4. Create the Role-wise Reliability Chart
# ---------------------------------------------------------
# Two side-by-side box plots; 'hue' colours the boxes by role
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
panels = (
    ('AVG', 'Consistency (Average) by Role'),             # left panel
    ('STRIKE RATE', 'Aggression (Strike Rate) by Role'),  # right panel
)
for panel_ax, (metric, heading) in zip(axes, panels):
    sns.boxplot(ax=panel_ax, data=q_df, x='Role', y=metric, hue='Role', palette='Set2')
    panel_ax.set_title(heading, fontsize=14, pad=10)
    # The x axis already names the roles, so the legend is redundant
    panel_legend = panel_ax.get_legend()
    if panel_legend:
        panel_legend.remove()

plt.tight_layout()
plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import textwrap

# 1. Data Preparation
# ---------------------------------------------------------
df = pd.read_csv('batting_stats.csv')
q_df = df.loc[df['INNS'] >= 5].copy()
q_df['BOUNDARY_RUNS'] = q_df['4s'].mul(4).add(q_df['6s'].mul(6))

# Strategic descriptions: player -> [style label, insight text]
descriptions = {
    'Hassan Azad': ["The Backbone", "Stretches far toward Average and Team %. He is the most vital player for the team's stability."],
    'Cameron Heard': ["The Powerhouse", "Stretches far toward Strike Rate and Boundary Runs. He is your most dangerous 'big hitter'."],
    'Nicky Kirkwood': ["The All-Rounder", "A very balanced, large shape. He scores boundaries and maintains a solid average."],
    'Johan Cronje': ["The Aggressor", "Strong toward Strike Rate. He keeps the scoreboard moving at a high pace."],
    'Amay Zindal': ["The Graftsman", "A smaller, more controlled shape focusing on playing the long game rather than pure power."],
}

# Top 5 scorers, with each metric rescaled to 0-1 across those five players
radar_df = q_df.sort_values('RUNS', ascending=False).head(5).copy()
metrics = ['AVG', 'STRIKE RATE', 'BOUNDARY_RUNS', 'TEAM RUNS(%)']
scaler = MinMaxScaler()
radar_data_scaled = scaler.fit_transform(radar_df[metrics])

# 2. Setup Plot
# ---------------------------------------------------------
plt.close('all')
fig = plt.figure(figsize=(14, 14))  # Slightly taller to accommodate the table
ax = fig.add_subplot(111, polar=True)

# One spoke per metric; the first angle is repeated to close each polygon
angles = np.linspace(0, 2*np.pi, len(metrics), endpoint=False).tolist()
angles.append(angles[0])

for player, scaled_row in zip(radar_df['Player'], radar_data_scaled):
    values = scaled_row.tolist()
    values.append(values[0])
    ax.plot(angles, values, label=player, linewidth=2)
    ax.fill(angles, values, alpha=0.1)

# First spoke at the top, remaining spokes laid out clockwise
ax.set_theta_offset(np.pi / 2)
ax.set_theta_direction(-1)
ax.set_thetagrids(np.degrees(angles[:-1]), metrics)

# 3. Formatted Data Table
# ---------------------------------------------------------
table_data = []
for p in radar_df['Player']:
    style_label, insight = descriptions.get(p, ["N/A", "N/A"])
    # Wrap text to 50 characters to prevent the table from becoming too wide
    table_data.append([p, style_label, "\n".join(textwrap.wrap(insight, width=50))])

the_table = ax.table(
    cellText=table_data,
    colLabels=["Player Name", "Style", "Strategic Insight"],
    loc='bottom',
    cellLoc='left',
    bbox=[0.0, -0.75, 1.0, 0.55],  # Full width, positioned below axis
)

# Professional Table Styling
the_table.auto_set_font_size(False)
the_table.set_fontsize(10)
the_table.scale(1, 2.5)  # Increase row height for better legibility

# Header row (row index 0): bold white text on dark blue
for (row_idx, _col_idx), cell in the_table.get_celld().items():
    if row_idx == 0:
        cell.set_text_props(weight='bold', color='white')
        cell.set_facecolor('#40466e')

# Final Layout Adjustments
fig.subplots_adjust(bottom=0.4)
ax.set_title('Player DNA Comparison: Top 5 Scorers', y=1.1, fontsize=16, weight='bold')
ax.legend(loc='upper right', bbox_to_anchor=(1.2, 1.1))
plt.show()
Why this code is valuable for your report: The Batting Index: This provides a single, objective number to settle debates about who is the "best" batter. It rewards players who balance speed and longevity perfectly.
The Visual Story: By seeing the dots move over the course of a season, you can track if your "Aggressors" are becoming "Stars" (improving their average) or if your "Anchors" are becoming more aggressive.
Labeling: Every player is named, making it a great discussion point for team meetings where players can see their own "DNA" compared to the rest of the squad.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 1. LOAD DATA
# The file name must match exactly (e.g., 'batting_stats.csv')
df = pd.read_csv('batting_stats.csv')

# 2. ANALYSIS LOGIC
# Keep only players with at least 5 innings
q_df = df.loc[df['INNS'] >= 5].copy()

# Squad medians: the cross-hairs that split the chart into quadrants
avg_mid, sr_mid = q_df['AVG'].median(), q_df['STRIKE RATE'].median()


def get_role(row):
    """Return the role label for one player, judged against the squad medians."""
    above_avg = row['AVG'] >= avg_mid
    above_sr = row['STRIKE RATE'] >= sr_mid
    if above_avg:
        return 'Star' if above_sr else 'Anchor'
    return 'Aggressor' if above_sr else 'Struggler'


q_df['Role'] = q_df.apply(get_role, axis=1)

# Advanced KPI: Batting Index (Efficiency) = average multiplied by strike rate
q_df['Batting Index'] = q_df['AVG'].mul(q_df['STRIKE RATE'])

# 3. CREATE THE VISUALIZATION
plt.figure(figsize=(15, 10))

# Scatter plot: colour = role, size of dot = total runs
map_ax = sns.scatterplot(
    data=q_df,
    x='STRIKE RATE',
    y='AVG',
    hue='Role',
    size='RUNS',
    sizes=(100, 1000),
    alpha=0.6,
    palette='Set2',
)

# Median reference lines
map_ax.axvline(sr_mid, ls='--', color='gray', alpha=0.5)
map_ax.axhline(avg_mid, ls='--', color='gray', alpha=0.5)

# Label every player, nudged up and to the right of the dot
for strike_rate, average, player in zip(q_df['STRIKE RATE'], q_df['AVG'], q_df['Player']):
    map_ax.text(strike_rate + 1, average + 0.5, player, fontsize=9, alpha=0.8)

# Quadrant captions along the top edge of the data
top_edge = q_df['AVG'].max()
map_ax.text(sr_mid + 5, top_edge, "STARS: Elite Match-Winners", color='green', weight='bold')
map_ax.text(q_df['STRIKE RATE'].min(), top_edge, "ANCHORS: Innings Stability", color='blue', weight='bold')

map_ax.set_title('Squad Strategy Map: Where every player fits', fontsize=16, pad=20)
map_ax.set_xlabel('Strike Rate (How fast they score)')
map_ax.set_ylabel('Batting Average (How long they stay in)')
map_ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

# 4. OUTPUT THE RANKINGS
print("\n--- Top 10 Players by Batting Index (Efficiency) ---")
ranking = q_df[['Player', 'Role', 'Batting Index']].sort_values('Batting Index', ascending=False)
print(ranking.head(10))
--- Top 10 Players by Batting Index (Efficiency) ---
Player Role Batting Index
2 Cameron Heard Star 4974.4125
10 Joshua Malkani Star 4157.0146
29 Kieran Brockelsby Star 3903.8475
27 Gregg Pepler Star 3802.5000
0 Hassan Azad Star 3706.6884
1 Nicky Kirkwood Star 3415.5450
3 Johan Cronje Star 3112.9285
9 Karl Glendenning Star 2649.2185
6 Huzaif Latif Star 2449.4712
20 Caleb Pasculli Star 2388.1199
Adding Boundary Frequency and Dot Ball % (or its proxy, Strike Rotation) takes your analysis to a professional scouting level. These "hidden" metrics explain why a player has a certain Strike Rate.
# 1. Advanced Calculations
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Load and Filter (players with at least 5 innings)
df = pd.read_csv('batting_stats.csv')
q_df = df.loc[df['INNS'] >= 5].copy()

# 2. Derived metrics ('BF' and 'BOUNDARY_RUNS' are not in the file)
# Strike Rate = (Runs / Balls) * 100, therefore Balls = (Runs / SR) * 100
q_df['Calculated_Balls'] = q_df['RUNS'].div(q_df['STRIKE RATE']).mul(100)

# Runs scored in fours and sixes
q_df['BOUNDARY_RUNS'] = q_df['4s'].mul(4).add(q_df['6s'].mul(6))

# Roles (Star, Anchor, Aggressor, Struggler) are judged against the squad medians
avg_mid, sr_mid = q_df['AVG'].median(), q_df['STRIKE RATE'].median()


def get_role(row):
    """Return the role label for one player, judged against the squad medians."""
    above_avg = row['AVG'] >= avg_mid
    above_sr = row['STRIKE RATE'] >= sr_mid
    if above_avg:
        return 'Star' if above_sr else 'Anchor'
    return 'Aggressor' if above_sr else 'Struggler'


q_df['Role'] = q_df.apply(get_role, axis=1)
print("Setup Complete. Base metrics and roles have been calculated.")
Setup Complete. Base metrics and roles have been calculated.
# 1. Boundary Frequency (How many balls per boundary?)
# Uses the estimated 'Calculated_Balls'. A player with no boundaries has no
# defined value, so the zero denominator becomes NaN instead of dividing by zero.
q_df['Balls_Per_Boundary'] = q_df['Calculated_Balls'] / (q_df['4s'] + q_df['6s']).replace(0, np.nan)

# 2. Rotation Strike Rate (How well they run singles/twos)
q_df['Non_Boundary_Runs'] = q_df['RUNS'] - q_df['BOUNDARY_RUNS']
q_df['Boundary_Balls_Est'] = (q_df['4s'] + q_df['6s'])  # Estimate 1 ball per boundary
q_df['Rotation_SR'] = (q_df['Non_Boundary_Runs'] / (q_df['Calculated_Balls'] - q_df['Boundary_Balls_Est']).replace(0, np.nan)) * 100

# Cleanup: fill gaps with 0 everywhere EXCEPT 'Balls_Per_Boundary'. Lower is
# better on that metric, so a 0 would present a batter who never hit a boundary
# as the most powerful hitter in the squad; leaving it NaN keeps such players
# out of rankings and plots of that metric.
fillable_cols = [col for col in q_df.columns if col != 'Balls_Per_Boundary']
q_df[fillable_cols] = q_df[fillable_cols].fillna(0)
print("Advanced metrics (Boundary Frequency & Rotation SR) are ready.")
Advanced metrics (Boundary Frequency & Rotation SR) are ready.
plt.figure(figsize=(14, 8))

# Scatter plot: Boundary Frequency vs Rotation Strike Rate
busy_ax = sns.scatterplot(
    data=q_df,
    x='Balls_Per_Boundary',
    y='Rotation_SR',
    hue='Role',
    size='RUNS',
    sizes=(100, 1000),
    alpha=0.7,
    palette='Set2',
)

# Only the key players discussed in the report get a name label
key_players = ['Hassan Azad', 'Cameron Heard', 'Nicky Kirkwood', 'Johan Cronje', 'Amay Zindal']
highlighted = q_df[q_df['Player'].isin(key_players)]
for balls_per_bdry, rotation_sr, player in zip(
    highlighted['Balls_Per_Boundary'], highlighted['Rotation_SR'], highlighted['Player']
):
    busy_ax.text(balls_per_bdry - 0.5, rotation_sr + 1, player,
                 fontsize=10, weight='bold', alpha=0.9)

# Flip the x axis so better power (fewer balls per boundary) sits on the right
busy_ax.invert_xaxis()
busy_ax.axhline(q_df['Rotation_SR'].median(), ls='--', color='gray', alpha=0.3)
busy_ax.axvline(q_df['Balls_Per_Boundary'].median(), ls='--', color='gray', alpha=0.3)

busy_ax.set_title('The Busy Batter Map: Boundary Power vs. Strike Rotation', fontsize=16, pad=20)
busy_ax.set_xlabel('Boundary Frequency (Fewer balls per boundary = More Power)')
busy_ax.set_ylabel('Rotation Strike Rate (Efficiency on Singles/Twos)')
busy_ax.grid(True, alpha=0.2)
busy_ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left')
plt.tight_layout()
plt.show()
Performance Inferences
- Stars (High Reliability & High Impact)
Inference: These players are the core of your team’s scoring. They maintain a high average while scoring quickly.
Coaching Focus: Their Rotation SR (Strike Rotation) is vital. It shows they don't just rely on boundaries but actively put the bowling side under pressure with singles and twos.
- Anchors (The Foundation)
Inference: These batters provide the stability needed to bat through the overs. They are characterized by higher-than-average consistency but a lower scoring rate.
Coaching Focus: Look at the Balls/Bdry column. The aim is to reduce this number slightly to increase their impact without sacrificing their resilience at the crease.
- Aggressors (The Power Hitters)
Inference: These players may not stay at the crease as long, but their boundary-hitting frequency is elite. They are match-winners in short bursts.
Coaching Focus: If an Aggressor has a low Rot SR, they are "boundary or bust." Encouraging them to rotate the strike early in their innings can help them find rhythm before playing the big shots.
- Development Zone (The Strugglers)
Inference: This group includes players whose current form or stats are below the squad median in both categories.
Coaching Focus: The data helps identify if the struggle is with power (high Balls/Bdry) or with strike rotation (low Rot SR). Targeted practice can address these specific areas.
- The Strike Rotators (Top 10)
Inference: This is a specialized view of the top 10 players who are best at avoiding dot balls. They are the most efficient at turning the strike over, which is crucial during middle overs or against tight bowling.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 1. LOAD & PREPARE DATA
# 'batting_stats.csv' must be in the working directory
df = pd.read_csv('batting_stats.csv')
q_df = df[df['INNS'] >= 5].copy()

# Advanced Metrics Calculation
# Balls faced are not in the file; derive them from Strike Rate = Runs / Balls * 100
q_df['Calculated_Balls'] = (q_df['RUNS'] / q_df['STRIKE RATE']) * 100
q_df['BOUNDARY_RUNS'] = (q_df['4s'] * 4) + (q_df['6s'] * 6)
q_df['Total_Boundaries'] = q_df['4s'] + q_df['6s']

# Intent Metric: Balls per boundary (Lower is better). Undefined (NaN) for a
# player with no boundaries, rather than a division by zero.
q_df['Balls_Per_Boundary'] = q_df['Calculated_Balls'] / q_df['Total_Boundaries'].replace(0, np.nan)

# Rotation Metric: Scoring speed on non-boundary balls (one ball assumed per boundary)
q_df['Non_Boundary_Runs'] = q_df['RUNS'] - q_df['BOUNDARY_RUNS']
q_df['Rotation_SR'] = (q_df['Non_Boundary_Runs'] / (q_df['Calculated_Balls'] - q_df['Total_Boundaries']).replace(0, np.nan)) * 100

# Fill gaps with 0 everywhere EXCEPT 'Balls_Per_Boundary'. Lower is better on
# that metric, so a 0 would present a batter who never hit a boundary as the
# most powerful hitter; leaving it NaN keeps such players off the power axis.
fillable_cols = [col for col in q_df.columns if col != 'Balls_Per_Boundary']
q_df[fillable_cols] = q_df[fillable_cols].fillna(0)

# 2. DEFINE SQUAD ROLES
avg_mid, sr_mid = q_df['AVG'].median(), q_df['STRIKE RATE'].median()


def assign_role(row):
    """Return the role label for one player, judged against the squad medians.

    Reads the module-level thresholds ``avg_mid`` and ``sr_mid``.
    """
    if row['AVG'] >= avg_mid and row['STRIKE RATE'] >= sr_mid:
        return 'Star'
    if row['AVG'] >= avg_mid:
        return 'Anchor'
    if row['STRIKE RATE'] >= sr_mid:
        return 'Aggressor'
    return 'Struggler'


q_df['Role'] = q_df.apply(assign_role, axis=1)
# 3. PLOTTING FUNCTION FOR ALL CATEGORIES
def generate_performance_card(data, title, color, filename):
    """Save a "performance card": a power-vs-rotation scatter plus a stats table.

    Args:
        data: DataFrame with 'Player', 'AVG', 'STRIKE RATE',
            'Balls_Per_Boundary' and 'Rotation_SR' columns.
        title: Title drawn above the scatter plot.
        color: Matplotlib colour used for the markers and the table header.
        filename: Path the finished figure is written to.

    Returns:
        None. Nothing is drawn or saved when ``data`` has no rows.
    """
    # An empty selection (e.g. a role nobody was assigned to) cannot be drawn:
    # Axes.table() fails on zero rows. Skip the card instead of aborting the run.
    if data.empty:
        print(f"Skipping '{title}': no players to plot.")
        return

    plt.close('all')
    fig, ax = plt.subplots(figsize=(10, 11))

    # Scatter Plot
    sns.scatterplot(data=data, x='Balls_Per_Boundary', y='Rotation_SR',
                    s=300, color=color, alpha=0.8, edgecolor='black', ax=ax)

    # Annotate Player Names just above each marker
    for _, player_row in data.iterrows():
        ax.text(player_row['Balls_Per_Boundary'], player_row['Rotation_SR'] + 1, player_row['Player'],
                ha='center', fontsize=9, weight='bold')

    # Formatting
    ax.invert_xaxis()  # Lower 'Balls per Boundary' is better, moving it to the right
    ax.set_title(title, fontsize=16, weight='bold', pad=25)
    ax.set_xlabel('Power (Balls Per Boundary - Lower is better)')
    ax.set_ylabel('Strike Rotation (Non-Boundary SR)')
    ax.grid(True, linestyle='--', alpha=0.2)

    # Prepare Table Data: best strike rotators first, one decimal place
    table_cols = ['Player', 'AVG', 'STRIKE RATE', 'Balls_Per_Boundary', 'Rotation_SR']
    table_df = data[table_cols].sort_values('Rotation_SR', ascending=False)
    table_df = table_df.round({'AVG': 1, 'STRIKE RATE': 1, 'Balls_Per_Boundary': 1, 'Rotation_SR': 1})
    table_df.columns = ["Player", "Avg", "SR", "B/Bdry", "Rot SR"]

    # Create Table below the plot
    the_table = ax.table(cellText=table_df.values,
                         colLabels=table_df.columns,
                         loc='bottom',
                         cellLoc='center',
                         bbox=[0.0, -0.4, 1.0, 0.3])

    # Style Table Header and Cells (row index 0 is the header row)
    the_table.auto_set_font_size(False)
    the_table.set_fontsize(11)
    the_table.scale(1.2, 1.5)
    for (row_idx, _col_idx), cell in the_table.get_celld().items():
        if row_idx == 0:
            cell.set_text_props(weight='bold', color='white')
            cell.set_facecolor(color)
        else:
            cell.set_facecolor('#fdfdfd')

    plt.subplots_adjust(bottom=0.35)
    plt.savefig(filename, bbox_inches='tight', dpi=150)
How to Use These Insights: Strike Rotators: If you see a player in this chart but not in the "Star" chart, it means they are working very hard but perhaps need to find more boundary-hitting power to jump to the next level.
Balls Per Boundary: Look for Cameron Heard or Nicky Kirkwood here; a lower number (further to the right) indicates someone the opposition will fear.
Rotation SR: A score above 50.0 in the "Rot SR" column means the player is scoring at least a run every two balls even without hitting a boundary.
# 4. GENERATE INDIVIDUAL CHARTS
# One card per squad role: (role, chart title, colour, output file)
role_cards = (
    ('Star', "Squad Role: Stars", "#2ecc71", "chart_star.png"),
    ('Anchor', "Squad Role: Anchors", "#3498db", "chart_anchor.png"),
    ('Aggressor', "Squad Role: Aggressors", "#e67e22", "chart_aggressor.png"),
    ('Struggler', "Squad Role: Development", "#95a5a6", "chart_struggler.png"),
)
for role_name, card_title, card_color, card_file in role_cards:
    generate_performance_card(q_df[q_df['Role'] == role_name], card_title, card_color, card_file)

# Extra card: the ten players with the highest Rotation_SR, whatever their role
top_rotators = q_df.sort_values('Rotation_SR', ascending=False).head(10)
generate_performance_card(top_rotators, "Elite Category: Strike Rotators (Top 10)", "#9b59b6", "chart_rotator.png")
I updated the script to include every regular player (those with 5 or more innings) in the dismissal analysis. I also dynamically adjusted the chart size so the names remain clear even with a larger group.
Below is the complete code and the inferences for each analysis section.
Inferences & Actionable Insights
- Squad Role Cards (Stars, Anchors, Aggressors, Development)
Purpose: Categorizes players based on their Average (longevity) and Strike Rate (speed) relative to the team median.
Inference: This allows you to identify who needs to focus on "staying in" (Development/Aggressors) and who needs to focus on "moving the game forward" (Anchors).
Action: Use these cards to set individual training goals. For example, an Anchor might be tasked with increasing their "Non-Boundary SR" by 5 points.
- Elite Category: The Strike Rotators
Purpose: Highlights the top 10 players best at turning the strike over (avoiding dot balls).
Inference: These are your most efficient batters. They are critical for the middle overs to ensure the scoreboard never stops ticking, especially against spin or tight defensive fields.
Action: Ensure these players are paired with Aggressors to balance high-risk boundaries with low-risk strike rotation.
- Squad Scouting: Dismissal Analysis (Heatmap)
Purpose: Shows exactly how every player in the squad gets out.
Inference: Patterns here reveal technical gaps. If a player has a high count in LBW or BOWLED, they may have a vulnerability to straight, full deliveries. High CAUGHT counts often indicate risk-taking or poor shot selection.
Action: Coaches can use this "opposition scouting" on their own team to design net sessions. If the squad as a whole has a high LBW count, the next practice should focus on "playing straight."
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 1. LOAD & PREPARE DATA
df = pd.read_csv('batting_stats.csv')
# Analyze regular players (at least 5 innings)
q_df = df[df['INNS'] >= 5].copy()

# 2. ADVANCED METRICS CALCULATIONS
# Balls faced are not in the file; derive them from Strike Rate = Runs / Balls * 100
q_df['Calculated_Balls'] = (q_df['RUNS'] / q_df['STRIKE RATE']) * 100
q_df['BOUNDARY_RUNS'] = (q_df['4s'] * 4) + (q_df['6s'] * 6)
q_df['Total_Boundaries'] = q_df['4s'] + q_df['6s']

# Intent Metric: Balls per boundary (Lower is better). Undefined (NaN) for a
# player with no boundaries, rather than a division by zero.
q_df['Balls_Per_Boundary'] = q_df['Calculated_Balls'] / q_df['Total_Boundaries'].replace(0, np.nan)

# Rotation Metric: Scoring speed on non-boundary balls (Singles/Twos)
q_df['Non_Boundary_Runs'] = q_df['RUNS'] - q_df['BOUNDARY_RUNS']
q_df['Rotation_SR'] = (q_df['Non_Boundary_Runs'] / (q_df['Calculated_Balls'] - q_df['Total_Boundaries']).replace(0, np.nan)) * 100

# Fill gaps with 0 everywhere EXCEPT 'Balls_Per_Boundary'. Lower is better on
# that metric, so a 0 would present a batter who never hit a boundary as the
# most powerful hitter; leaving it NaN keeps such players off the power axis.
fillable_cols = [col for col in q_df.columns if col != 'Balls_Per_Boundary']
q_df[fillable_cols] = q_df[fillable_cols].fillna(0)

# 3. ROLE ASSIGNMENT
avg_mid, sr_mid = q_df['AVG'].median(), q_df['STRIKE RATE'].median()


def assign_role(row):
    """Return the role label for one player, judged against the squad medians.

    Reads the module-level thresholds ``avg_mid`` and ``sr_mid``.
    """
    if row['AVG'] >= avg_mid and row['STRIKE RATE'] >= sr_mid:
        return 'Star'
    if row['AVG'] >= avg_mid:
        return 'Anchor'
    if row['STRIKE RATE'] >= sr_mid:
        return 'Aggressor'
    return 'Struggler'


q_df['Role'] = q_df.apply(assign_role, axis=1)
# 4. PLOTTING ENGINE FOR CARDS
def generate_performance_card(data, title, color, filename):
    """Save and show a "performance card": a scatter plot plus a stats table.

    Args:
        data: DataFrame with 'Player', 'AVG', 'STRIKE RATE',
            'Balls_Per_Boundary' and 'Rotation_SR' columns.
        title: Title drawn above the scatter plot.
        color: Matplotlib colour used for the markers and the table header.
        filename: Path the finished figure is written to.

    Returns:
        None. Nothing is drawn, saved or shown when ``data`` has no rows.
    """
    # An empty selection (e.g. a role nobody was assigned to) cannot be drawn:
    # Axes.table() fails on zero rows. Skip the card instead of aborting the run.
    if data.empty:
        print(f"Skipping '{title}': no players to plot.")
        return

    plt.close('all')
    fig, ax = plt.subplots(figsize=(10, 11))

    # Scatter Plot
    sns.scatterplot(data=data, x='Balls_Per_Boundary', y='Rotation_SR',
                    s=300, color=color, alpha=0.8, edgecolor='black', ax=ax)

    # Annotate Player Names just above each marker
    for _, player_row in data.iterrows():
        ax.text(player_row['Balls_Per_Boundary'], player_row['Rotation_SR'] + 1, player_row['Player'],
                ha='center', fontsize=9, weight='bold')

    # Visual Adjustments
    ax.invert_xaxis()  # Better power players move to the right
    ax.set_title(title, fontsize=16, weight='bold', pad=25)
    ax.set_xlabel('Power (Balls Per Boundary - Lower is better)')
    ax.set_ylabel('Strike Rotation (Non-Boundary SR)')
    ax.grid(True, linestyle='--', alpha=0.2)

    # Table Setup: best strike rotators first, one decimal place
    table_cols = ['Player', 'AVG', 'STRIKE RATE', 'Balls_Per_Boundary', 'Rotation_SR']
    table_df = data[table_cols].sort_values('Rotation_SR', ascending=False)
    table_df = table_df.round({'AVG': 1, 'STRIKE RATE': 1, 'Balls_Per_Boundary': 1, 'Rotation_SR': 1})
    table_df.columns = ["Player", "Avg", "SR", "B/Bdry", "Rot SR"]

    the_table = ax.table(cellText=table_df.values,
                         colLabels=table_df.columns,
                         loc='bottom',
                         cellLoc='center',
                         bbox=[0.0, -0.4, 1.0, 0.3])
    the_table.auto_set_font_size(False)
    the_table.set_fontsize(11)
    the_table.scale(1.2, 1.5)

    # Row index 0 is the header row; it takes the card colour
    for (row_idx, _col_idx), cell in the_table.get_celld().items():
        if row_idx == 0:
            cell.set_text_props(weight='bold', color='white')
            cell.set_facecolor(color)
        else:
            cell.set_facecolor('#fdfdfd')

    plt.subplots_adjust(bottom=0.35)
    plt.savefig(filename, bbox_inches='tight', dpi=150)
    plt.show()
# 5. EXECUTE: ROLE CARDS & ROTATORS
# One card per squad role: (role, chart title, colour, output file)
role_cards = (
    ('Star', "Squad Role: Stars", "#2ecc71", "chart_star.png"),
    ('Anchor', "Squad Role: Anchors", "#3498db", "chart_anchor.png"),
    ('Aggressor', "Squad Role: Aggressors", "#e67e22", "chart_aggressor.png"),
    ('Struggler', "Squad Role: Development", "#95a5a6", "chart_struggler.png"),
)
for role_name, card_title, card_color, card_file in role_cards:
    generate_performance_card(q_df[q_df['Role'] == role_name], card_title, card_color, card_file)

# Extra card: the ten players with the highest Rotation_SR, whatever their role
top_rotators = q_df.sort_values('Rotation_SR', ascending=False).head(10)
generate_performance_card(top_rotators, "Elite Category: Strike Rotators (Top 10)", "#9b59b6", "chart_rotator.png")

# 6. EXECUTE: DISMISSAL HEATMAP (ALL PLAYERS)
plt.close('all')
dismissal_types = ['BOWLED', 'CAUGHT', 'LBW', 'STUMPED', 'RUN OUT']
players_by_runs = q_df.sort_values('RUNS', ascending=False)
all_players_dismissals = players_by_runs.set_index('Player')[dismissal_types]

# Grow the figure with the squad: 0.3 inch per player, never below 8 inches
chart_height = max(8, len(all_players_dismissals) * 0.3)
plt.figure(figsize=(12, chart_height))
heat_ax = sns.heatmap(
    all_players_dismissals,
    annot=True,
    cmap='YlGnBu',
    fmt='g',
    cbar_kws={'label': 'Number of Dismissals'},
)
heat_ax.set_title('Squad Scouting: How every regular player gets out', fontsize=16, weight='bold', pad=20)
plt.tight_layout()
plt.savefig('chart_dismissals_all.png')
plt.show()
print("All charts generated and saved successfully.")
All charts generated and saved successfully.
Comments
Post a Comment