← Back to blog

Scrape ESPN Cricinfo: Cricket Stats, Player Averages & Match Data with Python (2026)

Scrape ESPN Cricinfo: Cricket Stats, Player Averages & Match Data with Python (2026)

If you're building a cricket analytics tool, a fantasy league engine, or just want to crunch player stats, ESPN Cricinfo is the place. It's the most comprehensive cricket database on the internet — player profiles going back decades, scorecards for every international match, Statsguru for cross-format queries, and full tournament coverage from Tests to IPL to whatever bilateral series is happening this week.

The catch: scraping it isn't trivial. Cricinfo uses a mix of server-rendered HTML and internal JSON APIs, and ESPN's bot protection is no joke. This post walks through the practical approach — how to find the right endpoints, pull structured data, and not get blocked doing it.

What Cricinfo Has (and Where It Lives)

Cricinfo's data breaks into several categories: player profiles and career averages, ball-by-ball match scorecards, series and fixture listings, live match summaries, and Statsguru's aggregate query engine.

The URLs follow predictable patterns. A player profile looks like:

https://www.espncricinfo.com/cricketers/virat-kohli-253802

That trailing number is the player's Cricinfo ID — you'll need it for API calls. A match scorecard:

https://www.espncricinfo.com/series/ipl-2024-1410320/match/1416535

Finding the Internal APIs

This is where DevTools earns its keep. Open the Network tab, filter for XHR/Fetch, and load a player profile page. You'll see calls to endpoints like:

https://hs-consumer-api.espncricinfo.com/v1/pages/player/career?playerId=253802&shouldFilterByPrimaryBattingStyle=false

That hs-consumer-api subdomain is the good stuff. It returns clean JSON instead of HTML soup. Some useful endpoint patterns:

# Career stats
/v1/pages/player/career?playerId={id}

# Recent matches
/v1/pages/player/matches?playerId={id}&page=1

# Match summary
/v1/pages/match/summary?seriesId={series_id}&matchId={match_id}

# Match scorecard
/v1/pages/match/scorecard?seriesId={series_id}&matchId={match_id}

# Series home
/v1/pages/series/home?seriesId={id}

# Series fixtures
/v1/pages/series/fixtures?seriesId={id}

# Player search
/v1/search/global?query={name}&type=player

These aren't documented, so they can shift. Check DevTools before assuming an endpoint is stable.

Setting Up Dependencies

pip install requests pandas beautifulsoup4 curl_cffi

The curl_cffi library becomes important later when dealing with Akamai's TLS fingerprinting. Install it alongside requests.

Scraping Player Career Stats

Here's a straightforward way to pull batting averages for a player:

import requests
import pandas as pd
import time

def get_player_career_stats(player_id: int, headers: dict) -> dict:
    """Fetch a player's full career-stats payload from the hs-consumer API.

    Args:
        player_id: Cricinfo player ID (trailing number in the profile URL).
        headers: browser-like HTTP headers to send with the request.

    Returns:
        The decoded JSON response as a dict.

    Raises:
        requests.HTTPError: on any non-2xx response.
    """
    endpoint = "https://hs-consumer-api.espncricinfo.com/v1/pages/player/career"
    query = {
        "playerId": player_id,
        "shouldFilterByPrimaryBattingStyle": "false",
    }
    response = requests.get(endpoint, params=query, headers=headers, timeout=15)
    response.raise_for_status()
    return response.json()

def parse_batting_averages(data: dict) -> pd.DataFrame:
    """Flatten the per-format batting summaries of a career payload.

    Formats with an empty/missing battingSummary are skipped. Returns one
    DataFrame row per format with snake_case column names.
    """
    # Output column -> key inside the API's battingSummary object.
    column_map = {
        "matches": "matches",
        "innings": "innings",
        "not_outs": "notOuts",
        "runs": "runs",
        "high_score": "highScore",
        "average": "average",
        "balls_faced": "ballsFaced",
        "strike_rate": "strikeRate",
        "hundreds": "hundreds",
        "fifties": "fifties",
        "ducks": "ducks",
        "fours": "fours",
        "sixes": "sixes",
    }
    rows = []
    for fmt_entry in data.get("career", {}).get("allFormats", []):
        summary = fmt_entry.get("battingSummary", {})
        if not summary:
            continue
        row = {"format": fmt_entry.get("formatType", "")}
        for col, src_key in column_map.items():
            row[col] = summary.get(src_key)
        rows.append(row)
    return pd.DataFrame(rows)

def parse_bowling_stats(data: dict) -> pd.DataFrame:
    """Flatten the per-format bowling summaries of a career payload.

    Formats where the player took no wickets (or has no bowlingSummary)
    are skipped, mirroring the batting parser's empty-summary guard.
    """
    # Output column -> key inside the API's bowlingSummary object.
    column_map = {
        "matches": "matches",
        "innings": "innings",
        "balls": "balls",
        "runs": "runs",
        "wickets": "wickets",
        "best_innings": "bestInnings",
        "best_match": "bestMatch",
        "economy": "economy",
        "average": "average",
        "strike_rate": "strikeRate",
        "five_wickets": "fiveWickets",
        "ten_wickets": "tenWickets",
    }
    rows = []
    for fmt_entry in data.get("career", {}).get("allFormats", []):
        summary = fmt_entry.get("bowlingSummary", {})
        # Only keep formats with at least one wicket taken.
        if not summary or not summary.get("wickets"):
            continue
        row = {"format": fmt_entry.get("formatType", "")}
        for col, src_key in column_map.items():
            row[col] = summary.get(src_key)
        rows.append(row)
    return pd.DataFrame(rows)


# Browser-like headers for the hs-consumer API. The Referer points at the
# Cricinfo front end, which this API normally serves — presumably this helps
# pass bot checks (TODO confirm which headers Akamai actually inspects).
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Accept": "application/json",
    "Referer": "https://www.espncricinfo.com/",
}

# NOTE(review): these lines hit the network at module level; wrap them in
# `if __name__ == "__main__":` if this file is ever imported as a module.
data = get_player_career_stats(253802, headers)  # Virat Kohli
batting_df = parse_batting_averages(data)
bowling_df = parse_bowling_stats(data)
print(batting_df)

The JSON structure varies by player — some fields are null if they never batted or bowled in a format. Always guard against missing keys.

Scraping Match Scorecards

For a full scorecard, hit the match scorecard endpoint and dig into the innings data:

def get_match_scorecard(series_id: int, match_id: int, headers: dict) -> dict:
    """Fetch the full scorecard JSON for one match.

    Args:
        series_id: Cricinfo series ID (from the series URL).
        match_id: Cricinfo match ID (from the match URL).
        headers: browser-like HTTP headers to send.

    Raises:
        requests.HTTPError: on any non-2xx response.
    """
    response = requests.get(
        "https://hs-consumer-api.espncricinfo.com/v1/pages/match/scorecard",
        params={"seriesId": series_id, "matchId": match_id},
        headers=headers,
        timeout=15,
    )
    response.raise_for_status()
    return response.json()

def parse_innings(scorecard_data: dict) -> list[dict]:
    """Flatten every batter's line from every innings into row dicts.

    Each row carries the innings number, batting team, player identity,
    scoring figures, and dismissal details (fielder/bowler when present).
    """
    rows: list[dict] = []
    for inns in scorecard_data.get("content", {}).get("innings", []):
        team_name = inns.get("team", {}).get("name", "")
        inns_number = inns.get("inningsId", 1)

        for entry in inns.get("inningsBatsmen", []):
            who = entry.get("player", {})
            fielders = entry.get("dismissalFielders")
            rows.append({
                "innings": inns_number,
                "team": team_name,
                "player": who.get("name"),
                "player_id": who.get("id"),
                "runs": entry.get("runs"),
                "balls": entry.get("balls"),
                "minutes": entry.get("minutes"),
                "fours": entry.get("fours"),
                "sixes": entry.get("sixes"),
                "strike_rate": entry.get("strikeRate"),
                "dismissal": entry.get("dismissalText", {}).get("short"),
                # First catcher/fielder only; None for bowled/not-out etc.
                "fielder": fielders[0].get("name") if fielders else None,
                "bowler": entry.get("dismissalBowler", {}).get("name"),
            })
    return rows

def parse_bowling(scorecard_data: dict) -> list[dict]:
    """Flatten every bowler's figures from every innings into row dicts.

    Note `batting_team` is the side that was batting in that innings,
    i.e. the team the bowler bowled against.
    """
    rows: list[dict] = []
    for inns in scorecard_data.get("content", {}).get("innings", []):
        batting_side = inns.get("team", {}).get("name", "")
        inns_number = inns.get("inningsId", 1)

        for figures in inns.get("inningsBowlers", []):
            who = figures.get("player", {})
            rows.append({
                "innings": inns_number,
                "batting_team": batting_side,
                "bowler": who.get("name"),
                "bowler_id": who.get("id"),
                "overs": figures.get("overs"),
                "maidens": figures.get("maidens"),
                # API calls runs conceded "conceded"; normalize to "runs".
                "runs": figures.get("conceded"),
                "wickets": figures.get("wickets"),
                "wides": figures.get("wides"),
                "no_balls": figures.get("noballs"),
                "economy": figures.get("economy"),
            })
    return rows


# Get a specific IPL match scorecard
# (series ID 1410320 and match ID 1416535 both come from the match URL shown
# earlier; `headers` is the browser-like dict defined above)
scorecard = get_match_scorecard(1410320, 1416535, headers)
batting = parse_innings(scorecard)
bowling = parse_bowling(scorecard)

# Tabulate the flattened rows for quick inspection.
batting_df = pd.DataFrame(batting)
bowling_df = pd.DataFrame(bowling)
print(batting_df[["player", "runs", "balls", "fours", "sixes", "dismissal"]])

Finding Player IDs

Before you can scrape a player, you need their Cricinfo ID. Two ways to find it:

From the URL: Navigate to any player profile page. The URL ends with player-name-XXXXXX where XXXXXX is the ID.

Via search API: Use the search endpoint to find players programmatically:

def search_players(query: str, headers: dict) -> list[dict]:
    """Search Cricinfo's global search API for players by name.

    Args:
        query: free-text name fragment, e.g. "Smith".
        headers: browser-like HTTP headers to send.

    Returns:
        A list of dicts with id, name, country and playing-style fields.
        NOTE(review): the results/players response shape is from DevTools
        observation — re-verify if the endpoint changes.

    Raises:
        requests.HTTPError: on any non-2xx response.
    """
    # Fix: the original used literal \"\"\" escapes for the docstring,
    # which is a SyntaxError in Python source.
    url = "https://hs-consumer-api.espncricinfo.com/v1/search/global"
    params = {"query": query, "type": "player"}

    resp = requests.get(url, params=params, headers=headers, timeout=10)
    resp.raise_for_status()
    data = resp.json()

    players = []
    for item in data.get("results", {}).get("players", []):
        players.append({
            "id": item.get("id"),
            "name": item.get("name"),
            "country": item.get("country", {}).get("name"),
            "batting_style": item.get("battingStyle"),
            "bowling_style": item.get("bowlingStyle"),
        })
    return players


# Find all players named "Smith"
# (`headers` is the browser-like header dict defined earlier in the post)
results = search_players("Smith", headers)
for p in results:
    print(f"{p['id']}: {p['name']} ({p['country']})")

Fetching Series and Tournament Data

For IPL, World Cup, or bilateral series data, the series endpoints give you fixtures and results:

def get_series_fixtures(series_id: int, headers: dict) -> list[dict]:
    """Get all matches in a series.

    Args:
        series_id: Cricinfo series ID (trailing number in the series URL).
        headers: browser-like HTTP headers to send.

    Returns:
        One dict per match with id, date, venue, team names and status.

    Raises:
        requests.HTTPError: on any non-2xx response.
    """
    # Fix: the original used literal \"\"\" escapes for the docstring
    # (a SyntaxError), and indexed teams[0] unconditionally, which raises
    # IndexError when the API returns an empty "teams" list.
    url = "https://hs-consumer-api.espncricinfo.com/v1/pages/series/fixtures"
    params = {"seriesId": series_id}

    resp = requests.get(url, params=params, headers=headers, timeout=15)
    resp.raise_for_status()
    data = resp.json()

    matches = []
    for match in data.get("content", {}).get("matches", []):
        teams = match.get("teams") or []
        matches.append({
            "match_id": match.get("objectId"),
            "series_id": series_id,
            "description": match.get("description"),
            "date": match.get("startDate"),
            "venue": match.get("ground", {}).get("smallName"),
            "team1": teams[0].get("team", {}).get("name") if len(teams) > 0 else None,
            "team2": teams[1].get("team", {}).get("name") if len(teams) > 1 else None,
            "status": match.get("statusText"),
            # NOTE(review): winnerTeamId is an ID, not a result string —
            # confirm downstream consumers expect that.
            "result": match.get("winnerTeamId"),
        })
    return matches


# IPL 2024 series ID: 1410320
fixtures = get_series_fixtures(1410320, headers)
# Print the first five fixtures. Slicing `date` to 10 chars assumes an
# ISO-8601 "YYYY-MM-DD..." string — TODO confirm the API's startDate format.
for match in fixtures[:5]:
    print(f"{match['date'][:10]}: {match['team1']} vs {match['team2']} @ {match['venue']}")

The Bot Protection Problem

ESPN runs Akamai Bot Manager across its properties, and Cricinfo is no exception. In practice that means TLS-fingerprint checks, IP-reputation scoring that penalizes datacenter ranges, and request-rate analysis — a naive script running from a cloud IP starts collecting 403s quickly.

For casual one-off scripts pulling a few players, rotating your user-agent and adding delays usually works. For anything at scale — bulk historical pulls, daily stat syncs, Statsguru scraping — you need residential proxies.

I've had good results with ThorData for this. Their residential pool has solid coverage and the per-GB pricing makes sense for sports data workloads that don't run 24/7. The key advantage is IP reputation — residential IPs simply don't trigger Akamai the same way cloud IPs do.

Plugging a proxy into the requests setup:

# Replace username/password/host/port with your proxy provider's credentials.
# The same HTTP proxy URL handles both schemes — HTTPS traffic is tunneled
# through it via CONNECT.
proxies = {
    "http": "http://username:[email protected]:PORT",
    "https": "http://username:[email protected]:PORT",
}

# `url` and `headers` refer to the variables from the earlier snippets.
resp = requests.get(url, headers=headers, proxies=proxies, timeout=20)

If you're hitting persistent TLS fingerprint issues, curl_cffi is worth adding — it impersonates a real browser's TLS handshake:

from curl_cffi import requests as cffi_requests

# impersonate="chrome124" makes curl_cffi present a Chrome-124-like TLS
# handshake; keep it in sync with the Chrome version in your User-Agent
# string so the two fingerprints don't contradict each other.
resp = cffi_requests.get(url, headers=headers, proxies=proxies, impersonate="chrome124")

Working with Statsguru

Statsguru is Cricinfo's advanced stats query tool at https://stats.espncricinfo.com/ci/engine/stats/index.html. It's the right place for aggregated data — "all Test batsmen averaging 50+ with 50+ innings" type queries.

The URL parameters map directly to the filter UI. You can construct queries programmatically:

from bs4 import BeautifulSoup

def query_statsguru(params: dict, headers: dict, proxies: dict = None) -> pd.DataFrame:
    """Run a Statsguru query and return its first results table.

    Statsguru serves server-rendered HTML, so this parses the page's
    engineTable rather than JSON.

    Args:
        params: Statsguru query-string parameters (class/type/filter/...).
        headers: browser-like HTTP headers to send.
        proxies: optional requests-style proxy mapping.

    Returns:
        The first parsed table as a DataFrame, or an empty DataFrame when
        no engineTable is found.
    """
    from io import StringIO  # local: only needed to feed read_html

    base = "https://stats.espncricinfo.com/ci/engine/stats/index.html"
    resp = requests.get(base, params=params, headers=headers, proxies=proxies, timeout=20)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    table = soup.find("table", class_="engineTable")
    if not table:
        return pd.DataFrame()
    # pandas >= 2.1 deprecates passing literal HTML to read_html;
    # wrap the markup in a file-like StringIO instead.
    dfs = pd.read_html(StringIO(str(table)))
    return dfs[0] if dfs else pd.DataFrame()


# All Test batsmen with 5000+ runs, sorted by average
test_batting_params = {
    "class": 1,          # 1=Test, 2=ODI, 3=T20I
    "type": "batting",
    "filter": "advanced",
    "runsmin1": 5000,
    "order_by": "average",
    "template": "results",
}

# All bowlers in T20Is with 100+ wickets
t20_bowling_params = {
    "class": 3,
    "type": "bowling",
    "filter": "advanced",
    "wicketsmin": 100,
    "order_by": "wickets",
    "template": "results",
}

df_batters = query_statsguru(test_batting_params, headers)
print(df_batters.head(20))

Statsguru pages are HTML-rendered, not JSON, so you're parsing tables. The column names are inconsistent across different query types — expect some cleanup work.

Batch Processing Multiple Players

For building a database of player stats, process multiple players efficiently:

import sqlite3
import json

def init_cricket_db(db_path: str = "cricket.db") -> sqlite3.Connection:
    """Create the players/player_batting tables if absent and return a connection.

    Fix: the original used literal \"\"\" escapes around the docstring and
    the SQL strings — a SyntaxError in Python source; they are now proper
    triple-quoted strings.

    Args:
        db_path: SQLite file path; ":memory:" works for testing.

    Returns:
        An open sqlite3.Connection with both tables committed.
    """
    conn = sqlite3.connect(db_path)
    # One batting row per (player, format); scraped_at records freshness.
    conn.execute("""
        CREATE TABLE IF NOT EXISTS player_batting (
            player_id INTEGER,
            format TEXT,
            matches INTEGER, innings INTEGER,
            not_outs INTEGER, runs INTEGER,
            high_score TEXT, average REAL,
            strike_rate REAL, hundreds INTEGER,
            fifties INTEGER, ducks INTEGER,
            scraped_at TEXT,
            PRIMARY KEY (player_id, format)
        )
    """)
    # Player bio table keyed on the Cricinfo player ID.
    conn.execute("""
        CREATE TABLE IF NOT EXISTS players (
            player_id INTEGER PRIMARY KEY,
            name TEXT, country TEXT,
            batting_style TEXT, bowling_style TEXT
        )
    """)
    conn.commit()
    return conn

def scrape_player_batch(player_ids: list[int], db_path: str = "cricket.db", proxies: dict = None):
    """Fetch career stats for each player ID and upsert them into SQLite.

    Best-effort batch: a failure on one player is logged and skipped so the
    run continues. Commits once per player, and sleeps between players even
    after an error so the request pacing stays uniform.

    NOTE(review): the `proxies` parameter is accepted but never used —
    get_player_career_stats() would need to forward it; confirm intent.
    Also relies on module-level `random`, which this file imports further
    down (after this definition but before the call).
    """
    conn = init_cricket_db(db_path)
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        "Accept": "application/json",
        "Referer": "https://www.espncricinfo.com/",
    }

    for i, player_id in enumerate(player_ids):
        print(f"[{i+1}/{len(player_ids)}] Player {player_id}")
        try:
            data = get_player_career_stats(player_id, headers)
            batting_df = parse_batting_averages(data)

            # Upsert the player's bio row (keyed on players.player_id).
            player_info = data.get("player", {})
            conn.execute(
                "INSERT OR REPLACE INTO players VALUES (?,?,?,?,?)",
                (
                    player_id,
                    player_info.get("name"),
                    player_info.get("country", {}).get("name"),
                    player_info.get("batting", {}).get("style"),
                    player_info.get("bowling", {}).get("style"),
                )
            )

            # One row per format; scraped_at is filled by CURRENT_TIMESTAMP.
            for _, row in batting_df.iterrows():
                conn.execute(
                    "INSERT OR REPLACE INTO player_batting VALUES (?,?,?,?,?,?,?,?,?,?,?,?,CURRENT_TIMESTAMP)",
                    (
                        player_id, row.get("format"), row.get("matches"),
                        row.get("innings"), row.get("not_outs"), row.get("runs"),
                        str(row.get("high_score")), row.get("average"),
                        row.get("strike_rate"), row.get("hundreds"),
                        row.get("fifties"), row.get("ducks"),
                    )
                )
            conn.commit()

        except Exception as e:
            # Swallow-and-log by design: one bad player must not abort the batch.
            print(f"  Error: {e}")

        # Polite jittered delay between players, even after an error.
        time.sleep(random.uniform(2, 5))

    conn.close()

# NOTE(review): `random` is imported mid-file, after scrape_player_batch's
# definition — this works because the import runs before the call below,
# but it belongs at the top of the file with the other imports.
import random
player_ids = [253802, 277916, 322560, 390702, 481896]  # Kohli, Root, Smith, Williamson, Babar
scrape_player_batch(player_ids)

Rate Limiting and Storage

Don't hammer it. A delay of 2-5 seconds between requests is the minimum. For bulk historical pulls, spread the load:

import time
import random

def safe_request(url, params, headers, proxies=None, min_delay=2.0, max_delay=5.0, max_retries=2):
    """GET with a polite random delay, bounded 429 retries, and 403 detection.

    Improvements over the original: the single 429 retry never re-checked
    the retried response's status, and a 403 on a retried response was not
    reported. Retries are now a bounded loop and the 403 check always runs.

    Args:
        url, params, headers, proxies: passed straight through to requests.get.
        min_delay, max_delay: bounds (seconds) for the jittered pre-request sleep.
        max_retries: how many extra attempts to make after a 429 (new
            parameter; default keeps at least the original single retry).

    Raises:
        requests.HTTPError: on 403 (likely bot-blocked) or any other 4xx/5xx.
    """
    # Jittered delay before every request so the access pattern looks human.
    time.sleep(random.uniform(min_delay, max_delay))
    resp = requests.get(url, params=params, headers=headers, proxies=proxies, timeout=20)

    attempts = 0
    while resp.status_code == 429 and attempts < max_retries:
        print("Rate limited, waiting 30s...")
        time.sleep(30)
        resp = requests.get(url, params=params, headers=headers, proxies=proxies, timeout=20)
        attempts += 1

    if resp.status_code == 403:
        print("Blocked (403). Try rotating proxy or adding more delay.")
        raise requests.HTTPError(f"403 Forbidden: {url}")
    resp.raise_for_status()
    return resp

For storage, CSV works fine for small datasets. For anything you'll query repeatedly, SQLite via pandas is cleaner:

import sqlite3

# Persist the DataFrame, query it back, THEN close — the original closed
# the connection before pd.read_sql, which raises
# sqlite3.ProgrammingError: Cannot operate on a closed database.
conn = sqlite3.connect("cricket_stats.db")
batting_df.to_sql("batting_averages", conn, if_exists="append", index=False)

# Pull it back
df_from_db = pd.read_sql("SELECT * FROM batting_averages WHERE format='TEST' AND average > 40 ORDER BY average DESC", conn)
conn.close()

Building a Fantasy Cricket Engine

Fantasy cricket platforms (Dream11, MPL, FanCode) score players based on runs, wickets, catches, and other in-match events. Cricinfo's ball-by-ball data lets you backtest fantasy scoring strategies:

FANTASY_SCORING = {
    "run": 1,
    "four_bonus": 1,      # bonus for boundaries
    "six_bonus": 2,       # bonus for sixes
    "half_century": 8,    # 50+ runs
    "century": 16,        # 100+ runs
    "duck": -2,           # dismissed for 0
    "wicket": 25,
    "lbw_bowled_bonus": 8,  # LBW or bowled
    "five_wicket_haul": 16,
    "catch": 8,
    "stumping": 12,
    "run_out": 6,
}

def calculate_fantasy_score(batting_row: dict, bowling_row: dict | None = None) -> float:
    score = 0.0

    # Batting
    runs = batting_row.get("runs", 0) or 0
    score += runs * FANTASY_SCORING["run"]

    fours = batting_row.get("fours", 0) or 0
    sixes = batting_row.get("sixes", 0) or 0
    score += fours * FANTASY_SCORING["four_bonus"]
    score += sixes * FANTASY_SCORING["six_bonus"]

    if runs >= 100:
        score += FANTASY_SCORING["century"]
    elif runs >= 50:
        score += FANTASY_SCORING["half_century"]

    if runs == 0 and batting_row.get("dismissal"):
        score += FANTASY_SCORING["duck"]

    # Bowling
    if bowling_row:
        wickets = bowling_row.get("wickets", 0) or 0
        score += wickets * FANTASY_SCORING["wicket"]
        if wickets >= 5:
            score += FANTASY_SCORING["five_wicket_haul"]

    return round(score, 1)

Wrapping Up

The data's all there. Cricinfo has better historical coverage than most paid sports APIs, and once you've mapped out the internal endpoints for your use case, it's pretty reliable to work with.

Building a Player Comparison Dashboard

With batting stats for multiple players, you can build cross-format comparison views:

def compare_players(player_ids: list[int], format_filter: str = "TEST") -> pd.DataFrame:
    """Fetch career batting stats for several players and build one row
    per player for the requested format, sorted by batting average
    (descending). Players that error out or lack the format are skipped."""
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        "Accept": "application/json",
        "Referer": "https://www.espncricinfo.com/",
    }

    collected = []
    for player_id in player_ids:
        try:
            payload = get_player_career_stats(player_id, request_headers)
            stats = parse_batting_averages(payload)
            bio = payload.get("player", {})
            display_name = bio.get("name", str(player_id))
            nation = bio.get("country", {}).get("name", "")

            subset = stats[stats["format"] == format_filter]
            if not subset.empty:
                record = subset.iloc[0].to_dict()
                record["player"] = display_name
                record["country"] = nation
                record["player_id"] = player_id
                collected.append(record)

            # Polite jittered delay between players.
            time.sleep(random.uniform(2, 4))
        except Exception as e:
            print(f"Error for player {player_id}: {e}")

    frame = pd.DataFrame(collected)
    if frame.empty:
        return frame
    return frame.sort_values("average", ascending=False)


# Compare "Fab Four" Test batsmen
# NOTE(review): these IDs differ from the batch example earlier, which lists
# Root/Smith/Williamson as 277916/322560/390702 — verify which set is correct
# against the players' profile URLs.
fab_four_ids = [253802, 303669, 267192, 299461]  # Kohli, Root, Smith, Williamson
comparison = compare_players(fab_four_ids, format_filter="TEST")
print(comparison[["player", "country", "matches", "runs", "average", "hundreds", "fifties"]].to_string(index=False))

Scraping Live Match Data

For live matches, the commentary endpoint delivers ball-by-ball updates:

def get_live_match_summary(series_id: int, match_id: int, headers: dict) -> dict:
    """Fetch the match summary endpoint and condense it to key fields.

    Works for live matches and for completed ones (most recent state).

    Raises:
        requests.HTTPError: on any non-2xx response.
    """
    response = requests.get(
        "https://hs-consumer-api.espncricinfo.com/v1/pages/match/summary",
        params={"seriesId": series_id, "matchId": match_id},
        headers=headers,
        timeout=15,
    )
    response.raise_for_status()

    header = response.json().get("content", {}).get("matchHeader", {})
    outcome = header.get("result", {})

    return {
        "match_id": match_id,
        "description": header.get("description"),
        "status": header.get("status"),
        "result": outcome.get("resultType"),
        "winner": outcome.get("winningTeam"),
        "win_margin": outcome.get("winningMargin"),
        # NOTE(review): this captures only the toss decision (bat/bowl),
        # not who won the toss — confirm that's intended.
        "toss": header.get("tossResults", {}).get("decision"),
        "venue": header.get("venue", {}).get("ground"),
        "series": header.get("seriesName"),
        "format": header.get("matchFormat"),
    }

Head-to-Head Records

One of the most popular cricket queries is head-to-head records between two teams:

def get_head_to_head(team1_id: int, team2_id: int, headers: dict) -> pd.DataFrame:
    """Get head-to-head Test records for two teams from Statsguru.

    Fixes: adds resp.raise_for_status() (every other fetcher in this post
    checks the status; this one silently parsed error pages), and wraps the
    table markup in StringIO for pandas >= 2.1's read_html.

    Args:
        team1_id/team2_id: Statsguru team IDs.
        headers: browser-like HTTP headers to send.

    Returns:
        The summary table as a DataFrame, or an empty DataFrame when no
        engineTable is present.
    """
    from io import StringIO  # local: only needed to feed read_html

    params = {
        "class": 1,       # Test cricket
        "type": "team",
        "team": team1_id,
        "opposition": team2_id,
        "template": "results",
        "view": "summary",
    }

    url = "https://stats.espncricinfo.com/ci/engine/stats/index.html"
    resp = requests.get(url, params=params, headers=headers, timeout=20)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    table = soup.find("table", class_="engineTable")
    if table:
        return pd.read_html(StringIO(str(table)))[0]
    return pd.DataFrame()

Handling JSON Structure Changes

Cricinfo occasionally restructures their API responses. Defensive parsing protects your pipeline:

def safe_get(data: dict, *keys, default=None):
    """Walk a nested dict/list structure, returning *default* on any miss.

    Each key may be a dict key or an integer list index. Any type mismatch,
    missing key, out-of-range index, or intermediate None short-circuits to
    *default*.
    """
    node = data
    for step in keys:
        if isinstance(node, dict):
            node = node.get(step)
        elif isinstance(node, list) and isinstance(step, int) and len(node) > step:
            node = node[step]
        else:
            # Wrong container type for this step (or index out of range).
            return default
        if node is None:
            return default
    return node


# Example: safely get player batting average
# (`data` is the career payload fetched earlier via get_player_career_stats)
avg = safe_get(data, "career", "allFormats", 0, "battingSummary", "average", default=0.0)

Exporting Data for Visualization

Once you have player stats in SQLite, export to formats compatible with visualization tools:

import json

def export_player_stats_json(db_path: str, output_path: str):
    """Export all player batting stats as JSON for frontend use.

    Joins the players/player_batting tables, groups rows per player, and
    writes a list of {"name", "country", "formats": {fmt: {...}}} objects.

    Fixes: the connection is now closed in a finally block (the original
    leaked it if the query raised, e.g. on missing tables), and the
    mid-function import is hoisted to the top of the function.

    Args:
        db_path: path to the SQLite database built by the batch scraper.
        output_path: destination JSON file.
    """
    from collections import defaultdict

    conn = sqlite3.connect(db_path)
    try:
        rows = conn.execute("""
            SELECT p.player_id, p.name, p.country,
                   b.format, b.matches, b.runs, b.average,
                   b.strike_rate, b.hundreds, b.fifties
            FROM players p
            JOIN player_batting b ON p.player_id = b.player_id
            ORDER BY p.name, b.format
        """).fetchall()
    finally:
        conn.close()

    # Group the flat join result by player; one "formats" entry per format.
    player_dict = defaultdict(lambda: {"name": "", "country": "", "formats": {}})
    for pid, name, country, fmt, matches, runs, avg, sr, hundreds, fifties in rows:
        player_dict[pid]["name"] = name
        player_dict[pid]["country"] = country
        player_dict[pid]["formats"][fmt] = {
            "matches": matches, "runs": runs, "average": avg,
            "strike_rate": sr, "hundreds": hundreds, "fifties": fifties,
        }

    with open(output_path, "w", encoding="utf-8") as out:
        json.dump(list(player_dict.values()), out, indent=2)

    print(f"Exported {len(player_dict)} players to {output_path}")