← Back to blog

How to Scrape HackerRank Challenges in 2026: Python, API Access & Practice Data

How to Scrape HackerRank Challenges in 2026: Python, API Access & Practice Data

HackerRank hosts tens of thousands of coding challenges across algorithms, data structures, machine learning, and domain-specific tracks. If you're building a coding practice aggregator, tracking problem difficulty distributions, researching the competitive programming landscape, or creating custom study lists, HackerRank's data is valuable.

This guide covers what data is available, how HackerRank's site works under the hood, and how to extract challenge metadata, leaderboards, and submission statistics reliably.

What Data Is Available on HackerRank

HackerRank exposes several categories of public data: challenge metadata (name, slug, difficulty, max score, success ratio), per-challenge leaderboards, contest listings with participant counts, and the track/domain structure used to organize practice problems.

Most of this is accessible without authentication. Contest solutions and editorial content require login.

How HackerRank Serves Its Data

HackerRank runs a React single-page application backed by a REST API. The good news: they use versioned, predictable API endpoints. The less good news: the app requires JavaScript to render, and many endpoints need a valid CSRF token.

The core API pattern is:

https://www.hackerrank.com/rest/contests/master/challenges?offset=0&limit=20&track=algorithms

The master slug refers to the main practice area. For specific contests, the slug matches the contest URL.

Response format: JSON, well-structured, with a models array and pagination metadata.

Authentication: Not required for challenge listings and leaderboards. Required for submission history, user-specific data, and editorial access.

Scraping Challenge Listings Without Auth

The practice challenge endpoint is publicly accessible:

import httpx
import time
import json
from pathlib import Path

BASE = "https://www.hackerrank.com"

# Browser-like headers: HackerRank blocks default Python user agents and
# validates the Referer header (see the anti-bot section below).
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                   "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Accept": "application/json",
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://www.hackerrank.com/domains/algorithms",
    "X-Requested-With": "XMLHttpRequest",
}

# Practice-track slugs passed as the `track` query parameter.
TRACK_SLUGS = [
    "algorithms",
    "data-structures",
    "mathematics",
    "artificial-intelligence",
    "databases",
    "shell",
    "python",
    "java",
    "c",
    "cpp",
]

def fetch_challenges_page(
    client: httpx.Client,
    track: str,
    offset: int = 0,
    limit: int = 50,
    difficulty: str | None = None,
) -> dict:
    """Fetch one page of practice challenges for a track.

    Args:
        client: Reusable httpx client (keeps connections alive).
        track: Track slug, e.g. "algorithms".
        offset: Pagination offset.
        limit: Page size.
        difficulty: Optional filter: easy, medium, hard, expert.

    Returns:
        Parsed JSON payload with a "models" list and pagination metadata.

    Raises:
        httpx.HTTPStatusError: On error responses, including a 429 that
            does not clear after the bounded retry loop below.
    """
    params = {
        "offset": offset,
        "limit": limit,
        "track": track,
    }
    if difficulty:
        params["difficulty"] = difficulty  # easy, medium, hard, expert

    # Bounded retry loop instead of the original unbounded recursion:
    # a persistently rate-limited IP would otherwise retry forever.
    resp = None
    for _ in range(5):
        resp = client.get(
            f"{BASE}/rest/contests/master/challenges",
            params=params,
            headers=HEADERS,
            timeout=20,
        )
        if resp.status_code == 429:
            retry_after = int(resp.headers.get("Retry-After", 30))
            print(f"  Rate limited — waiting {retry_after}s")
            time.sleep(retry_after)
            continue
        break

    # Surfaces a persistent 429 (or any other error) as HTTPStatusError,
    # keeping the exception family callers already expect.
    resp.raise_for_status()
    return resp.json()

def scrape_track(track: str, max_challenges: int = 500) -> list[dict]:
    """Page through a track's challenge listing and collect metadata dicts."""
    collected: list[dict] = []
    page_size = 50
    cursor = 0

    with httpx.Client() as client:
        while cursor < max_challenges:
            payload = fetch_challenges_page(client, track, cursor, page_size)
            batch = payload.get("models", [])
            if not batch:
                break

            # Flatten each API model into the compact record we store.
            collected.extend(
                {
                    "slug": item.get("slug"),
                    "name": item.get("name"),
                    "difficulty": item.get("difficulty_name"),
                    "max_score": item.get("max_score"),
                    "success_ratio": item.get("success_ratio"),
                    "total_count": item.get("total_count"),
                    "preview": item.get("preview"),
                    "track": track,
                    "tags": [tag.get("name") for tag in item.get("tags", [])],
                    "primary_technology": item.get("primary_technology"),
                }
                for item in batch
            )

            total = payload.get("total", 0)
            cursor += page_size
            print(f"  {track}: {len(collected)}/{total} challenges")

            if len(collected) >= total:
                break

            time.sleep(1.5)

    return collected

if __name__ == "__main__":
    # Collect every configured track sequentially and dump one JSON file.
    all_challenges = []
    for track in TRACK_SLUGS:
        print(f"Scraping track: {track}")
        challenges = scrape_track(track)
        all_challenges.extend(challenges)
        print(f"  Total so far: {len(all_challenges)}")
        time.sleep(3)  # polite pause between tracks

    out = Path("hackerrank_challenges.json")
    out.write_text(json.dumps(all_challenges, indent=2))
    print(f"\nSaved {len(all_challenges)} challenges to {out}")

Fetching Challenge Detail Pages

Each challenge has its own JSON endpoint with full problem metadata:

def fetch_challenge_detail(client: httpx.Client, slug: str) -> dict:
    """Fetch full metadata for a single challenge.

    Returns an empty dict for 404s (deleted/renamed challenges).
    Raises httpx.HTTPStatusError on other errors, including a 429 that
    survives the bounded retry loop below (the original recursed without
    limit, which never terminates against a persistently throttled IP).
    """
    resp = None
    for _ in range(5):
        resp = client.get(
            f"{BASE}/rest/contests/master/challenges/{slug}",
            headers=HEADERS,
            timeout=20,
        )
        if resp.status_code == 404:
            return {}
        if resp.status_code == 429:
            time.sleep(30)
            continue
        break

    resp.raise_for_status()
    model = resp.json().get("model", {})

    return {
        "slug": model.get("slug"),
        "name": model.get("name"),
        "body_html": model.get("body_html"),  # full problem statement
        "difficulty": model.get("difficulty_name"),
        "max_score": model.get("max_score"),
        "success_ratio": model.get("success_ratio"),
        "total_submissions": model.get("total_count"),
        "time_limit": model.get("time_limit"),
        "memory_limit": model.get("memory_limit"),
        "editorial_available": model.get("editorial") is not None,
        "languages": [lang.get("name") for lang in model.get("languages", [])],
        "input_format": model.get("input_format"),
        "output_format": model.get("output_format"),
        "constraints": model.get("constraints"),
    }

# Enrich a batch of challenges with full details
def enrich_challenges(
    challenges: list[dict],
    max_items: int = 100,
    delay: float = 2.0,
) -> list[dict]:
    """Merge detail-endpoint fields into the first *max_items* challenges.

    NOTE: mutates the input dicts in place (via dict.update) and also
    returns them as a new list, matching the original behavior.
    """
    output: list[dict] = []
    batch_size = min(max_items, len(challenges))

    with httpx.Client() as client:
        for idx, record in enumerate(challenges[:max_items]):
            slug = record.get("slug")
            if not slug:
                output.append(record)
                continue

            try:
                record.update(fetch_challenge_detail(client, slug))
            except httpx.HTTPError as e:
                print(f"  Failed to fetch detail for {slug}: {e}")

            output.append(record)

            if (idx + 1) % 10 == 0:
                print(f"  Enriched {idx+1}/{batch_size}")
            time.sleep(delay)

    return output

Scraping Leaderboards

Per-challenge leaderboards show who solved it, their score, and time elapsed:

def fetch_leaderboard(
    client: httpx.Client,
    challenge_slug: str,
    offset: int = 0,
    limit: int = 50,
) -> dict:
    """Fetch one page of a challenge's public leaderboard.

    Adds the bounded 429-retry behavior the other fetchers in this
    article use (the original raised immediately on a 429); raises
    httpx.HTTPStatusError on any error that does not clear.
    """
    resp = None
    for _ in range(5):
        resp = client.get(
            f"{BASE}/rest/contests/master/challenges/{challenge_slug}/leaderboard",
            params={"offset": offset, "limit": limit},
            headers=HEADERS,
            timeout=20,
        )
        if resp.status_code == 429:
            time.sleep(int(resp.headers.get("Retry-After", 30)))
            continue
        break
    resp.raise_for_status()
    return resp.json()

def scrape_challenge_leaderboard(
    challenge_slug: str,
    max_entries: int = 200,
) -> list[dict]:
    """Collect up to *max_entries* leaderboard rows for one challenge."""
    rows: list[dict] = []
    page_size = 50
    position = 0

    with httpx.Client() as client:
        while position < max_entries:
            payload = fetch_leaderboard(client, challenge_slug, position, page_size)
            batch = payload.get("models", [])
            if not batch:
                break

            rows.extend(
                {
                    "rank": item.get("rank"),
                    "hacker": item.get("hacker"),
                    "score": item.get("score"),
                    "time_taken": item.get("time_taken"),
                    "language": item.get("language"),
                }
                for item in batch
            )

            position += page_size
            if position >= payload.get("total", 0):
                break
            time.sleep(1.0)

    return rows

# Example: top solvers for a famous problem
board = scrape_challenge_leaderboard("solve-me-first")
for entry in board[:10]:
    # rank right-aligned in 4 cols, handle left-aligned in 20, score in 3
    print(f"  #{entry['rank']:>4} | {entry['hacker']:<20} | {entry['score']:>3} pts | {entry['language']}")

Contest Data

HackerRank hosts regular contests. The contest list endpoint returns structured data:

def fetch_contests(
    contest_type: str = "upcoming",  # one of: upcoming, current, past
    offset: int = 0,
    limit: int = 20,
) -> list[dict]:
    """Return one page of contests filtered by status."""
    resp = httpx.get(
        f"{BASE}/rest/contests",
        params={
            "offset": offset,
            "limit": limit,
            "filter": contest_type,
        },
        headers=HEADERS,
        timeout=20,
    )
    resp.raise_for_status()
    # Project each API model down to the fields we care about.
    return [
        {
            "slug": c.get("slug"),
            "name": c.get("name"),
            "start_time": c.get("epoch_starttime"),
            "end_time": c.get("epoch_endtime"),
            "description": c.get("description"),
            "participant_count": c.get("total_participants"),
        }
        for c in resp.json().get("models", [])
    ]

# Get upcoming contests
upcoming = fetch_contests("upcoming")
for c in upcoming:
    # .get with a default guards against a missing/None participant count
    print(f"  {c['name']}: {c.get('participant_count', 0)} participants")

# Get challenges from a specific contest
def fetch_contest_challenges(contest_slug: str) -> list[dict]:
    """Return the raw challenge models for one contest."""
    url = f"{BASE}/rest/contests/{contest_slug}/challenges"
    resp = httpx.get(url, headers=HEADERS, timeout=20)
    resp.raise_for_status()
    payload = resp.json()
    return payload.get("models", [])

Authenticated Requests: Session Cookies

Some HackerRank data requires authentication — notably submission history, editorial solutions, and user-specific stats. You can use browser session cookies to authenticate requests:

import httpx
from bs4 import BeautifulSoup

def create_authenticated_session(
    session_cookie: str,
    csrf_token: str,
) -> httpx.Client:
    """Build an httpx client carrying HackerRank session + CSRF credentials."""
    client = httpx.Client()
    # The session cookie authenticates the user; the CSRF header is what
    # user-specific endpoints validate.
    client.cookies.set("_hrank_session", session_cookie, domain="www.hackerrank.com")
    browser_ua = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 Chrome/131.0.0.0 Safari/537.36"
    )
    client.headers.update({
        "X-CSRF-Token": csrf_token,
        "User-Agent": browser_ua,
        "Referer": "https://www.hackerrank.com/",
    })
    return client

def fetch_my_submissions(
    client: httpx.Client,
    challenge_slug: str,
) -> list[dict]:
    """List the authenticated user's submissions for one challenge."""
    url = (
        "https://www.hackerrank.com/rest/contests/master/challenges/"
        f"{challenge_slug}/submissions"
    )
    resp = client.get(url, timeout=20)
    resp.raise_for_status()
    return resp.json().get("models", [])

# Get CSRF token from cookies after login
# Open DevTools > Application > Cookies > _csrf_token after logging in
# SESSION = "your-session-cookie-value"
# CSRF = "your-csrf-token-value"
# client = create_authenticated_session(SESSION, CSRF)

Anti-Bot Detection on HackerRank

HackerRank's defenses are moderate compared to sites like LinkedIn or Indeed:

  1. Rate limiting by IP — Aggressive scraping from a single IP triggers 429 responses. The rate limit appears to be around 60-100 requests/minute.
  2. User-Agent checks — Requests with Python's default user agent (python-httpx/...) are blocked or return empty results.
  3. Referer validation — The API checks that requests come from www.hackerrank.com pages via the Referer header.
  4. Session validation for sensitive endpoints — The submission and editorial endpoints require valid session cookies.

What Does Not Work

Requests with Python's default user agent, requests missing a Referer header, and sustained traffic from a single IP — all of these get blocked, rate-limited, or return empty results (see the anti-bot list above).

What Works

For large-scale data collection — say, pulling metadata for all 4,000+ algorithm challenges — residential proxies are essential. ThorData provides rotating residential IPs that distribute requests across different IP addresses, preventing the per-IP rate limiting HackerRank uses:

import httpx
import time
from typing import Optional

# ThorData residential proxy — https://thordata.partnerstack.com/partner/0a0x4nzb (or [Oxylabs](https://oxylabs.go2cloud.org/aff_c?offer_id=7&aff_id=2066&url_id=174))
# Scheme-to-proxy-URL mapping; replace USER/PASS with real credentials.
PROXY_CONFIG = {
    "http://": "http://USER:[email protected]:9000",
    "https://": "http://USER:[email protected]:9000",
}

def fetch_with_proxy(
    url: str,
    params: Optional[dict] = None,
    max_retries: int = 5,
) -> Optional[dict]:
    """GET *url* through the residential proxy with exponential backoff.

    Returns the parsed JSON on success, or None after exhausting retries
    or on a non-retryable HTTP error.
    """
    # httpx's `proxy=` argument expects a single proxy URL string, not a
    # scheme-mapping dict — pick the entry matching the target scheme.
    scheme_key = "https://" if url.startswith("https://") else "http://"
    proxy_url = PROXY_CONFIG[scheme_key]

    backoff = 2
    for attempt in range(max_retries):
        try:
            resp = httpx.get(
                url,
                params=params,
                headers=HEADERS,
                proxy=proxy_url,
                timeout=20,
            )
            if resp.status_code == 429:
                wait = backoff ** attempt
                print(f"  Rate limited, waiting {wait}s (attempt {attempt+1})")
                time.sleep(wait)
                continue
            resp.raise_for_status()
            return resp.json()
        except httpx.TimeoutException:
            print(f"  Timeout on attempt {attempt+1}")
            time.sleep(backoff ** attempt)
        except httpx.HTTPError as e:
            # Non-retryable (4xx/5xx after raise_for_status): give up.
            print(f"  HTTP error: {e}")
            return None
    return None

Parsing Problem HTML Content

Challenge body HTML contains the problem statement, constraints, sample input/output, and test cases. Parsing this into structured data requires BeautifulSoup:

from bs4 import BeautifulSoup
import re

def parse_problem_html(html_body: str) -> dict:
    """Split a HackerRank problem statement into named sections, extracted
    numeric constraints, and sample input/output pairs."""
    if not html_body:
        return {}

    soup = BeautifulSoup(html_body, "html.parser")

    def _flush(name: str, parts: list, into: dict) -> None:
        # Join accumulated text fragments, dropping whitespace-only ones.
        into[name] = " ".join(p.strip() for p in parts if p.strip())

    sections: dict = {}
    heading = "description"
    buffer: list = []

    # h3 headings delimit sections; p/ul/ol/pre nodes are section content.
    # Only the soup's direct children are inspected, as in the original.
    for node in soup.children:
        if node.name == "h3":
            if buffer:
                _flush(heading, buffer, sections)
            heading = node.get_text(strip=True).lower()
            buffer = []
        elif node.name in ("p", "ul", "ol", "pre"):
            buffer.append(node.get_text())

    if buffer:
        _flush(heading, buffer, sections)

    # Extract numeric constraints like "1 <= N <= 10^5".
    constraint_pattern = (
        r'\b(?:1|10)\^{?\d+}?\s*(?:<=|<)\s*(?:N|n|Q|q)\s*(?:<=|<)\s*(?:1|10)\^{?\d+}?'
    )
    n_constraints = re.findall(constraint_pattern, sections.get("constraints", ""))

    # Pair up sample-input / sample-output <pre> blocks in document order.
    inputs = soup.find_all("pre", class_=re.compile("sample-input"))
    outputs = soup.find_all("pre", class_=re.compile("sample-output"))
    samples = [
        {"input": i.get_text(), "output": o.get_text()}
        for i, o in zip(inputs, outputs)
    ]

    return {
        **sections,
        "n_constraints": n_constraints,
        "sample_cases": samples,
    }

Building a Difficulty Distribution Dataset

One useful analysis: map difficulty to success ratio across all tracks to find which problem categories are hardest in practice:

import json
import statistics
from pathlib import Path
from collections import defaultdict

def analyze_difficulty_distribution(challenges_file: str) -> dict:
    """Summarize success ratios per stated difficulty label.

    Reads the JSON list produced by the scraper, prints a summary table,
    and returns ``{difficulty: {count, avg_success, median_success}}``.

    The original also built a per-track grouping (``by_track``) that was
    never read — that dead computation is removed here.
    """
    data = json.loads(Path(challenges_file).read_text())

    # Group success ratios by difficulty label; skip rows without a ratio.
    by_difficulty = defaultdict(list)
    for ch in data:
        ratio = ch.get("success_ratio")
        if ratio is not None:
            by_difficulty[ch.get("difficulty", "Unknown")].append(float(ratio))

    report = {}
    print(f"{'Difficulty':<12} {'Count':>6} {'Avg Success':>12} {'Median':>8}")
    print("-" * 42)
    for diff in ["Easy", "Medium", "Hard", "Expert"]:
        ratios = by_difficulty.get(diff, [])
        if not ratios:
            continue
        avg = statistics.mean(ratios)
        med = statistics.median(ratios)
        print(f"{diff:<12} {len(ratios):>6} {avg:>11.1f}% {med:>7.1f}%")
        report[diff] = {"count": len(ratios), "avg_success": avg, "median_success": med}

    return report

# Run analysis on collected data
# analyze_difficulty_distribution("hackerrank_challenges.json")

Bulk Scraping All Tracks

Here is a complete pipeline that collects all challenges across every domain:

#!/usr/bin/env python3
"""
HackerRank challenge dataset builder.
Collects challenge metadata across all major tracks and saves to JSON.
"""

import httpx
import json
import time
from pathlib import Path
from datetime import datetime

BASE = "https://www.hackerrank.com"
# NOTE: the output directory is created eagerly at import time.
OUTPUT_DIR = Path("hackerrank_data")
OUTPUT_DIR.mkdir(exist_ok=True)

# Track slugs covering the major practice domains.
ALL_TRACKS = [
    "algorithms", "data-structures", "mathematics",
    "artificial-intelligence", "databases", "shell",
    "python", "java", "c", "cpp", "javascript",
    "ruby", "linux-shell", "distributed-systems",
]

# Browser-like User-Agent and Referer: required to avoid blocked or
# empty API responses.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                   "AppleWebKit/537.36 Chrome/131.0.0.0 Safari/537.36",
    "Accept": "application/json",
    "Referer": "https://www.hackerrank.com/",
}

def fetch_all_challenges_for_track(track: str) -> list[dict]:
    """Page through one track until the API reports no more challenges."""
    results: list[dict] = []
    page_size = 50
    position = 0

    with httpx.Client() as client:
        while True:
            try:
                resp = client.get(
                    f"{BASE}/rest/contests/master/challenges",
                    params={"offset": position, "limit": page_size, "track": track},
                    headers=HEADERS,
                    timeout=20,
                )
                if resp.status_code == 429:
                    time.sleep(30)
                    continue
                resp.raise_for_status()
                data = resp.json()
            except Exception as e:
                # Best-effort collection: log and return what we have so far.
                print(f"  Error fetching {track} at offset {position}: {e}")
                break

            models = data.get("models", [])
            if not models:
                break

            results.extend(
                {
                    "slug": m.get("slug"),
                    "name": m.get("name"),
                    "difficulty": m.get("difficulty_name"),
                    "track": track,
                    "max_score": m.get("max_score"),
                    "success_ratio": m.get("success_ratio"),
                    "total_submissions": m.get("total_count"),
                    "tags": [t.get("name") for t in m.get("tags", [])],
                    "primary_technology": m.get("primary_technology"),
                    "url": f"https://www.hackerrank.com/challenges/{m.get('slug')}/problem",
                }
                for m in models
            )

            total = data.get("total", 0)
            position += page_size
            # Stop at the reported total or at a 1000-challenge safety cap.
            if position >= total or position >= 1000:
                break

            time.sleep(1.5)

    return results

def run_full_collection():
    """Scrape every configured track, then write one timestamped JSON dataset."""
    all_challenges: list[dict] = []
    stats: dict[str, int] = {}

    for track in ALL_TRACKS:
        print(f"Collecting: {track}")
        batch = fetch_all_challenges_for_track(track)
        all_challenges.extend(batch)
        stats[track] = len(batch)
        print(f"  {len(batch)} challenges collected")
        time.sleep(3)  # pause between tracks

    # Save full dataset with collection metadata.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M")
    out_file = OUTPUT_DIR / f"hackerrank_all_{timestamp}.json"
    payload = {
        "collected_at": datetime.now().isoformat(),
        "total_challenges": len(all_challenges),
        "by_track": stats,
        "challenges": all_challenges,
    }
    out_file.write_text(json.dumps(payload, indent=2))

    print(f"\nComplete! {len(all_challenges)} challenges saved to {out_file}")
    by_count = sorted(stats.items(), key=lambda item: item[1], reverse=True)
    for track, count in by_count:
        print(f"  {track:<30} {count:>4} challenges")

if __name__ == "__main__":
    # Entry point: build the full dataset when run as a script.
    run_full_collection()

Use Cases

With this data collected, you can build:

Practice schedulers — Sort problems by success_ratio and difficulty to build a progressive curriculum. Start with Easy problems that have 80%+ success rates, then move to Medium problems with 40-60% rates.

Skill gap analysis — Compare difficulty distributions across tracks. AI/ML challenges tend to have lower success rates than equivalent-difficulty Algorithms problems.

Hiring signal research — Many companies use HackerRank for technical screening. The most-attempted problems in certain domains reveal what skills companies test for.

Contest trackers — Monitor upcoming contests, participant counts, and winner data to identify when activity spikes in the competitive programming community.

Study plan generators — Given a target company or role, recommend a sequence of HackerRank problems that match that company's screening preferences.

Rate Limits and Staying Unblocked

HackerRank applies IP-level rate limiting. The thresholds are approximately:

| Request pattern | Result |
| --- | --- |
| 1 request/second sustained | Works fine |
| 3-5 requests/second burst | Occasional 429 |
| 10+ requests/second | Consistent 429s |
| No delays for 100+ requests | IP block (temporary) |

For sustainable high-volume collection, use ThorData's residential proxy pool. Rotating residential IPs means each request appears to come from a different user, effectively eliminating per-IP rate limits. With proxy rotation, you can safely increase your request rate to 5-10/second across the pool.

Always add jitter to your delays — fixed intervals (exactly 1.000s) are a bot signal. Randomize between 0.8 and 2.5 seconds for more human-like patterns.

Summary

HackerRank exposes a clean REST API that returns JSON for challenge listings, leaderboards, and contest data. No authentication is needed for most public data. The key parameters are the track slug and pagination offsets.

The main scraping challenges are rate limiting (solvable with residential proxies) and proper header configuration (a real Chrome User-Agent and Referer header is required). With both in place, you can collect the full challenge catalog, difficulty distributions, success rates, and leaderboard data for competitive programming research or study tool development.

Advanced: Contest Scraping and Historical Data

HackerRank hosts contests regularly. Building a historical contest dataset reveals difficulty curves, prize structures, and participation patterns:

import httpx
import json
import time
from pathlib import Path
from datetime import datetime

BASE = "https://www.hackerrank.com"
# Browser-like headers; required for non-empty API responses.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/131.0.0.0 Safari/537.36",
    "Accept": "application/json",
    "Referer": "https://www.hackerrank.com/",
}

def fetch_past_contests(max_pages: int = 10) -> list[dict]:
    """Collect up to *max_pages* pages of finished contests."""
    contests: list[dict] = []
    page_size = 20
    position = 0

    with httpx.Client() as client:
        while position < max_pages * page_size:
            resp = client.get(
                f"{BASE}/rest/contests",
                params={"filter": "past", "offset": position, "limit": page_size},
                headers=HEADERS,
                timeout=20,
            )
            if resp.status_code == 429:
                time.sleep(30)
                continue
            resp.raise_for_status()
            payload = resp.json()
            batch = payload.get("models", [])
            if not batch:
                break

            contests.extend(
                {
                    "slug": c.get("slug"),
                    "name": c.get("name"),
                    "start_time": c.get("epoch_starttime"),
                    "end_time": c.get("epoch_endtime"),
                    # Truncate long descriptions to keep the dataset small.
                    "description": c.get("description", "")[:200],
                    "participant_count": c.get("total_participants", 0),
                    "challenge_count": c.get("challenges_count", 0),
                }
                for c in batch
            )

            position += page_size
            total = payload.get("total", 0)
            print(f"  Fetched {len(contests)}/{total} contests")
            if position >= total:
                break
            time.sleep(2)

    return contests


def fetch_contest_leaderboard(contest_slug: str, max_entries: int = 100) -> list[dict]:
    """Collect up to *max_entries* rows from a contest leaderboard."""
    rows: list[dict] = []
    page_size = 25
    position = 0

    with httpx.Client() as client:
        while position < max_entries:
            resp = client.get(
                f"{BASE}/rest/contests/{contest_slug}/leaderboard",
                params={"offset": position, "limit": page_size},
                headers=HEADERS,
                timeout=20,
            )
            # Any non-200 (auth required, throttled, missing) ends pagination.
            if resp.status_code != 200:
                break
            payload = resp.json()
            batch = payload.get("models", [])
            if not batch:
                break

            rows.extend(
                {
                    "rank": e.get("rank"),
                    "hacker": e.get("hacker"),
                    "score": e.get("score"),
                    "time_taken": e.get("time_taken"),
                }
                for e in batch
            )

            position += page_size
            if position >= payload.get("total", 0):
                break
            time.sleep(1)

    return rows

Tracking Problem Difficulty Over Time

One useful insight from HackerRank data: comparing stated difficulty labels against actual success rates reveals that "Medium" problems vary enormously in practice:

import json
import statistics
from pathlib import Path
from collections import defaultdict


def analyze_difficulty_vs_success(challenges_file: str) -> None:
    """Print how stated difficulty labels compare to observed success rates."""
    raw = json.loads(Path(challenges_file).read_text())
    # Accept either a bare list or the {"challenges": [...]} envelope.
    challenges = raw if isinstance(raw, list) else raw.get("challenges", [])

    per_difficulty = defaultdict(list)
    per_track = defaultdict(lambda: defaultdict(list))

    for ch in challenges:
        ratio = ch.get("success_ratio")
        if ratio is None:
            continue
        try:
            value = float(ratio)
        except (ValueError, TypeError):
            continue
        diff = ch.get("difficulty", "Unknown")
        per_difficulty[diff].append(value)
        per_track[ch.get("track", "unknown")][diff].append(value)

    print("Difficulty vs. Actual Success Rate:")
    print(f"{'Difficulty':<12} {'Count':>6} {'Avg%':>8} {'Median%':>9} {'Min%':>7} {'Max%':>7}")
    print("-" * 52)
    for diff in ("Easy", "Medium", "Hard", "Expert"):
        ratios = per_difficulty.get(diff, [])
        if not ratios:
            continue
        line = (
            f"{diff:<12} {len(ratios):>6} {statistics.mean(ratios):>7.1f}% "
            f"{statistics.median(ratios):>8.1f}% "
            f"{min(ratios):>6.1f}% "
            f"{max(ratios):>6.1f}%"
        )
        print(line)

    print("\nHardest tracks by average success rate:")
    averages = []
    for track, groups in per_track.items():
        combined = [r for ratios in groups.values() for r in ratios]
        if combined:
            averages.append((track, statistics.mean(combined), len(combined)))

    for track, avg, count in sorted(averages, key=lambda row: row[1])[:10]:
        print(f"  {track:<30} {avg:>6.1f}% avg success ({count} challenges)")

Extracting Hiring Signal Data

Many companies use HackerRank for technical interviews. The platforms they test on and the problem types they favor reveal hiring preferences:

import re
from bs4 import BeautifulSoup


def extract_interview_kit_companies() -> list[dict]:
    """
    HackerRank's Interview Preparation Kit page lists companies
    that commonly use specific problem types for screening.
    Scrape the interview kit category structure.
    """
    import httpx

    url = "https://www.hackerrank.com/interview/interview-preparation-kit"
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/131.0.0.0 Safari/537.36",
    }

    resp = httpx.get(url, headers=request_headers, timeout=20)
    if resp.status_code != 200:
        return []

    soup = BeautifulSoup(resp.text, "html.parser")

    # Each category block carries a topic heading and a challenge count.
    results: list[dict] = []
    for section in soup.select(".challenge-category"):
        heading = section.select_one("h3")
        if not heading:
            continue
        counter = section.select_one(".challenge-count")
        results.append({
            "topic": heading.get_text(strip=True),
            "challenge_count": counter.get_text(strip=True) if counter else None,
        })

    return results


def get_company_tagged_challenges(company_name: str) -> list[dict]:
    """Fetch challenges tagged with a specific company name.

    Returns compact metadata dicts, or an empty list on any non-200
    response. (Removed the unused `time` import the original carried on
    a combined import line.)
    """
    # Company-tagged problems appear in the URL pattern:
    # /challenges?filters%5Bcompanies%5D%5B%5D=COMPANY_NAME
    import httpx  # local import keeps this snippet self-contained

    resp = httpx.get(
        "https://www.hackerrank.com/rest/contests/master/challenges",
        params={
            "filters[companies][]": company_name,
            "offset": 0,
            "limit": 50,
        },
        headers=HEADERS,
        timeout=20,
    )
    if resp.status_code != 200:
        return []

    challenges = []
    for ch in resp.json().get("models", []):
        challenges.append({
            "slug": ch.get("slug"),
            "name": ch.get("name"),
            "difficulty": ch.get("difficulty_name"),
            "success_ratio": ch.get("success_ratio"),
            "total_submissions": ch.get("total_count"),
        })
    return challenges

Building a Study Planner

Using collected challenge data, build a personalized study planner that sequences problems optimally:

import json
from pathlib import Path
from collections import defaultdict


def build_study_plan(
    challenges_file: str,
    target_track: str = "algorithms",
    daily_problems: int = 3,
    days: int = 30,
) -> list[dict]:
    """Sequence a track's challenges into an easy-to-hard daily plan."""
    raw = json.loads(Path(challenges_file).read_text())
    # Accept either a bare list or the {"challenges": [...]} envelope.
    pool = raw if isinstance(raw, list) else raw.get("challenges", [])

    selected = [c for c in pool if c.get("track") == target_track]

    # Order: Easy -> Medium -> Hard -> Expert; within a level, higher
    # success ratio first (missing/zero ratios fall back to 50).
    level = {"Easy": 0, "Medium": 1, "Hard": 2, "Expert": 3}

    def sort_key(c: dict) -> tuple:
        return (
            level.get(c.get("difficulty", "Medium"), 1),
            -(c.get("success_ratio") or 50),
        )

    selected.sort(key=sort_key)

    plan: list[dict] = []
    cursor = 0
    for day in range(1, days + 1):
        if cursor >= len(selected):
            break
        batch = [
            {
                "name": c.get("name"),
                "slug": c.get("slug"),
                "difficulty": c.get("difficulty"),
                "success_rate": c.get("success_ratio"),
                "url": f"https://www.hackerrank.com/challenges/{c.get('slug')}/problem",
            }
            for c in selected[cursor:cursor + daily_problems]
        ]
        cursor += len(batch)
        if batch:
            plan.append({"day": day, "problems": batch})

    return plan


def print_study_plan_week(plan: list[dict]) -> None:
    """Pretty-print the first seven days of a study plan."""
    print("30-Day HackerRank Study Plan (first week):")
    print()
    for entry in plan[:7]:
        print(f"Day {entry['day']}:")
        for problem in entry["problems"]:
            # Missing/zero success rates render as "N/A".
            if problem.get('success_rate'):
                rate = f"{problem['success_rate']:.0f}%"
            else:
                rate = "N/A"
            print(f"  - {problem['name']} ({problem['difficulty']}, {rate} success rate)")
        print()

Saving and Exporting Data

import json
import csv
from pathlib import Path
from datetime import datetime


def save_challenge_dataset(
    challenges: list[dict],
    output_dir: str = "hackerrank_data",
) -> None:
    """Persist a scraped challenge list as timestamped JSON and CSV files.

    Args:
        challenges: Challenge dicts as returned by the scraper.
        output_dir: Directory for the output files; created (including
            parent directories) if it does not exist.
    """
    out = Path(output_dir)
    # parents=True so a nested path like "data/hackerrank" also works.
    out.mkdir(parents=True, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M")

    # Full-fidelity JSON snapshot: keeps every field the scraper collected.
    json_file = out / f"challenges_{timestamp}.json"
    json_file.write_text(
        json.dumps(
            {
                "collected_at": datetime.now().isoformat(),
                "total": len(challenges),
                "challenges": challenges,
            },
            indent=2,
            ensure_ascii=False,  # keep non-ASCII challenge names readable
        ),
        encoding="utf-8",
    )

    # Flat CSV for spreadsheet analysis; extra fields are dropped
    # (extrasaction="ignore") and missing fields become empty cells.
    csv_file = out / f"challenges_{timestamp}.csv"
    fieldnames = ["slug", "name", "difficulty", "track", "max_score",
                  "success_ratio", "total_submissions", "url"]
    with open(csv_file, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction="ignore")
        writer.writeheader()
        writer.writerows(challenges)

    print(f"Saved {len(challenges)} challenges:")
    print(f"  JSON: {json_file}")
    print(f"  CSV:  {csv_file}")


def export_leaderboard_analysis(
    challenge_slug: str,
    leaderboard: list[dict],
    output_dir: str = "hackerrank_data",
) -> None:
    """Summarize a scraped leaderboard and write the analysis to JSON.

    Args:
        challenge_slug: Slug of the challenge the leaderboard belongs to.
        leaderboard: Entries with at least "language" and "score" keys.
        output_dir: Directory for the output file; created if missing.
    """
    from collections import Counter

    out = Path(output_dir)
    # parents=True so nested output directories are created as needed.
    out.mkdir(parents=True, exist_ok=True)

    # Language distribution across all entries.
    lang_counts = Counter(entry.get("language", "Unknown") for entry in leaderboard)

    # Score distribution, skipping entries with no score at all.
    scores = [entry.get("score", 0) for entry in leaderboard if entry.get("score") is not None]

    # Compute the top score once instead of re-evaluating max(scores)
    # for every element inside a generator (the original was O(n^2)).
    top_score = max(scores) if scores else None

    analysis = {
        "challenge": challenge_slug,
        "total_entries": len(leaderboard),
        "language_distribution": dict(lang_counts.most_common(10)),
        "score_stats": {
            "min": min(scores) if scores else None,
            "max": top_score,
            "avg": sum(scores) / len(scores) if scores else None,
        },
        # Number of entries that achieved the highest observed score.
        "perfect_scores": sum(1 for s in scores if s == top_score) if scores else 0,
    }

    out_file = out / f"leaderboard_{challenge_slug}.json"
    out_file.write_text(json.dumps(analysis, indent=2))
    print(f"Leaderboard analysis saved to {out_file}")
    print(f"  Top languages: {dict(lang_counts.most_common(5))}")

Combining with Other Job Data

HackerRank challenge data becomes most valuable when cross-referenced with job market data. Companies whose names appear frequently in challenge tags are often actively using those problem types in their screening pipelines:

import json
from pathlib import Path
from collections import defaultdict


def match_challenges_to_job_requirements(
    challenges_file: str,
    jobs_file: str,
) -> dict:
    """Cross-reference HackerRank problems with LinkedIn job descriptions."""
    challenge_data = json.loads(Path(challenges_file).read_text())
    if isinstance(challenge_data, dict):
        challenge_data = challenge_data.get("challenges", [])

    job_listings = json.loads(Path(jobs_file).read_text())

    # Index challenges by each of their (lower-cased) skill tags.
    tag_to_challenges = defaultdict(list)
    for challenge in challenge_data:
        for tag in challenge.get("tags", []):
            tag_to_challenges[tag.lower()].append(challenge)

    # Count how many job descriptions mention each challenge tag.
    mention_counts = defaultdict(int)
    for listing in job_listings:
        description = (listing.get("description", "") or "").lower()
        for tag in tag_to_challenges:
            if tag in description:
                mention_counts[tag] += 1

    # Rank tags by how often the job market asks for them.
    ranked = sorted(mention_counts.items(), key=lambda item: item[1], reverse=True)

    print("HackerRank topics most demanded in job descriptions:")
    for skill, count in ranked[:15]:
        tagged = tag_to_challenges[skill]
        print(f"  {skill:<30} mentioned in {count} job descriptions ({len(tagged)} challenges)")

    return dict(ranked)

Conclusion

HackerRank's REST API is unusually accessible for a major platform. Challenge metadata, leaderboards, and contest data are available without authentication. With proper rate limiting (1-2 second delays, residential proxy rotation from ThorData for high-volume use), you can build comprehensive datasets covering the full challenge catalog across all tracks.

The data enables applications ranging from personalized study planners and difficulty analysis tools to hiring intelligence platforms that map problem types to company screening preferences. The success ratio field alone — the percentage of all-time attempts that produced correct solutions — provides a ground-truth difficulty signal that is far more reliable than the subjective Easy/Medium/Hard labels.

Automated Progress Tracking

Track your own progress through HackerRank challenges using the authenticated API endpoints and build a personal dashboard:

import json
import time
from pathlib import Path
from datetime import datetime
from collections import defaultdict


def build_personal_progress_tracker(
    session_id: str,
    csrf_token: str,
    output_dir: str = "progress_tracking",
) -> None:
    """Snapshot solved-challenge counts per track to a timestamped JSON file.

    Requires an authenticated HackerRank session: pass the ``_hrank_session``
    cookie value and the matching CSRF token from a logged-in browser.

    Args:
        session_id: Value of the ``_hrank_session`` cookie.
        csrf_token: Value sent as the ``X-CSRF-Token`` header.
        output_dir: Directory where progress snapshots are written.
    """
    import httpx

    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/131.0.0.0 Safari/537.36",
        "Accept": "application/json",
        "X-CSRF-Token": csrf_token,
    }
    cookies = {"_hrank_session": session_id}

    progress_data = {}

    # Fetch solved challenges per track, paginating through every page.
    # (A single offset=0/limit=50 request silently caps each track at
    # 50 solves — the original under-counted active users.)
    tracks = ["algorithms", "data-structures", "mathematics", "python", "databases"]
    for track in tracks:
        solved: list[dict] = []
        total = 0
        offset = 0
        first_page_ok = True
        while True:
            resp = httpx.get(
                "https://www.hackerrank.com/rest/contests/master/challenges",
                params={"track": track, "limit": 50, "offset": offset, "solved": 1},
                headers=headers,
                cookies=cookies,
                timeout=20,
            )
            if resp.status_code != 200:
                # First page failed: skip the track entirely (matches the
                # previous behavior). A later page failing keeps partial data.
                first_page_ok = offset > 0
                break
            data = resp.json()
            page = data.get("models", [])
            total = data.get("total", 0)
            if not page:
                break
            solved.extend(page)
            offset += len(page)
            time.sleep(1.5)  # stay under the per-IP rate limit
            if offset >= total:
                break
        if not first_page_ok:
            continue

        progress_data[track] = {
            "solved": len(solved),
            "total": total,
            "challenges": [
                {"slug": ch.get("slug"), "name": ch.get("name"), "difficulty": ch.get("difficulty_name")}
                for ch in solved
            ],
        }
        print(f"  {track}: {len(solved)} solved")

    # Save progress snapshot
    snapshot = {
        "timestamp": datetime.now().isoformat(),
        "tracks": progress_data,
        "total_solved": sum(d["solved"] for d in progress_data.values()),
    }

    out_file = out / f"progress_{datetime.now().strftime('%Y%m%d_%H%M')}.json"
    out_file.write_text(json.dumps(snapshot, indent=2))
    print(f"Progress snapshot saved: {out_file}")


def visualize_progress_over_time(progress_dir: str) -> None:
    """Print a date-ordered table of total solved counts across snapshots."""
    snapshot_files = sorted(Path(progress_dir).glob("progress_*.json"))
    if not snapshot_files:
        print("No progress snapshots found")
        return

    # Load every snapshot into a flat timeline, filename-ordered.
    timeline = [
        {
            "timestamp": snap["timestamp"],
            "total_solved": snap["total_solved"],
            "by_track": {name: info["solved"] for name, info in snap["tracks"].items()},
        }
        for snap in (json.loads(path.read_text()) for path in snapshot_files)
    ]

    print("Progress over time:")
    print(f"{'Date':<12} {'Total':>7} {'Change':>7}")
    print("-" * 30)
    previous_total = None
    for point in timeline:
        date = point["timestamp"][:10]
        total = point["total_solved"]
        delta = 0 if previous_total is None else total - previous_total
        delta_str = f"+{delta}" if delta >= 0 else str(delta)
        print(f"{date:<12} {total:>7} {delta_str:>7}")
        previous_total = total

API Reference Summary

Quick reference for all HackerRank endpoints used in this guide:

Endpoint Description Auth Required
/rest/contests/master/challenges List challenges by track No
/rest/contests/master/challenges/{slug} Challenge detail No
/rest/contests/master/challenges/{slug}/leaderboard Per-challenge leaderboard No
/rest/contests List contests No
/rest/contests/{slug}/challenges Contest challenges No
/rest/contests/{slug}/leaderboard Contest leaderboard No
/rest/users/{username} Public user profile No
/rest/contests/master/challenges?solved=1 Your solved challenges Yes
/rest/contests/master/challenges/{slug}/submissions Your submissions Yes

The unauthenticated endpoints are the richest source of data and require only proper User-Agent and Referer headers. For the authenticated endpoints, you need a valid _hrank_session cookie and X-CSRF-Token header obtained by logging in through a browser.

For high-volume data collection, use ThorData's residential proxy pool to distribute requests across multiple IPs and avoid the per-IP rate limiting that triggers after roughly 60-100 requests per minute.

Tracking HackerRank Hiring Challenges Over Time

Companies regularly publish "HackerRank Certification Tests" and hiring challenges under their own subdomains (e.g., company.hackerrank.com). Monitoring newly published contests is valuable for job seekers and competitive intelligence researchers.

import httpx
import json
from datetime import datetime, timedelta

def fetch_active_contests(min_age_days: int = 0, max_age_days: int = 30) -> list[dict]:
    """Fetch contests whose creation date falls inside a sliding window.

    Args:
        min_age_days: Skip contests newer than this many days.
        max_age_days: Stop paginating once contests older than this appear
            (the API returns newest-first, so everything after is older).

    Returns:
        One dict per matching contest with slug/name/description/created_at/
        end_time plus an ``is_hiring`` heuristic flag based on its name.
    """
    from datetime import timezone  # local import keeps the snippet self-contained

    url = "https://www.hackerrank.com/rest/contests"
    params = {
        "offset": 0,
        "limit": 100,
        "sort_by": "created_at",
        "sort_order": "desc",
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
        "Accept": "application/json",
    }
    # Timezone-aware "now": datetime.utcnow() is deprecated and produced
    # naive datetimes that forced an awkward tzinfo-stripping round-trip.
    now = datetime.now(timezone.utc)
    cutoff_newer = now - timedelta(days=min_age_days)
    cutoff_older = now - timedelta(days=max_age_days)
    results: list[dict] = []
    with httpx.Client(headers=headers, timeout=15) as client:
        while True:
            resp = client.get(url, params=params)
            resp.raise_for_status()
            data = resp.json()
            contests = data.get("models", [])
            if not contests:
                break
            for contest in contests:
                created_str = contest.get("created_at", "")
                try:
                    created = datetime.fromisoformat(created_str.replace("Z", "+00:00"))
                except Exception:
                    continue  # unparseable timestamp: skip this contest
                if created.tzinfo is None:
                    # Defensive: treat naive API timestamps as UTC.
                    created = created.replace(tzinfo=timezone.utc)
                if created < cutoff_older:
                    # Results are newest-first, so everything beyond this
                    # point is outside the window: stop paginating.
                    return results
                if created <= cutoff_newer:
                    results.append({
                        "slug": contest.get("slug"),
                        "name": contest.get("name"),
                        "description": contest.get("description", "")[:200],
                        "created_at": created_str,
                        "end_time": contest.get("end_time"),
                        "is_hiring": "hiring" in contest.get("name", "").lower()
                                     or "interview" in contest.get("name", "").lower(),
                    })
            params["offset"] += len(contests)
    return results

if __name__ == "__main__":
    # Demo: list contests from the last week and flag likely hiring tests.
    recent = fetch_active_contests(max_age_days=7)
    hiring_contests = [entry for entry in recent if entry["is_hiring"]]
    print(f"Found {len(recent)} recent contests, {len(hiring_contests)} look like hiring challenges")
    for entry in hiring_contests[:5]:
        print(f"  {entry['name']} — {entry['slug']}")

Building a HackerRank Study Progress Tracker

When preparing for technical interviews, systematic tracking beats ad-hoc practice. Combine the challenge listing scraper with a local SQLite database:

import sqlite3
from pathlib import Path
from datetime import datetime

DB_PATH = Path.home() / ".hackerrank_tracker.db"

def init_tracker(db_path=None):
    """Open (creating if needed) the local study-tracker SQLite database.

    Args:
        db_path: Optional path to the SQLite file. Defaults to the
            module-level DB_PATH (~/.hackerrank_tracker.db). Accepting a
            path (e.g. ":memory:") makes the function testable without
            touching the user's home directory.

    Returns:
        An open sqlite3.Connection with the schema and view ensured.
    """
    # Resolve lazily so DB_PATH is only consulted when no path is given.
    conn = sqlite3.connect(db_path if db_path is not None else DB_PATH)
    conn.executescript("""
        CREATE TABLE IF NOT EXISTS challenges (
            slug TEXT PRIMARY KEY,
            name TEXT,
            difficulty TEXT,
            domain TEXT,
            subdomain TEXT,
            score INTEGER,
            success_ratio REAL
        );
        CREATE TABLE IF NOT EXISTS attempts (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            challenge_slug TEXT,
            attempted_at TEXT,
            solved INTEGER DEFAULT 0,
            notes TEXT,
            FOREIGN KEY (challenge_slug) REFERENCES challenges(slug)
        );
        CREATE VIEW IF NOT EXISTS study_progress AS
            SELECT c.domain, c.subdomain, c.difficulty,
                   COUNT(DISTINCT c.slug) AS total_challenges,
                   COUNT(DISTINCT CASE WHEN a.solved=1 THEN a.challenge_slug END) AS solved,
                   ROUND(100.0 * COUNT(DISTINCT CASE WHEN a.solved=1 THEN a.challenge_slug END)
                         / COUNT(DISTINCT c.slug), 1) AS pct_complete
            FROM challenges c
            LEFT JOIN attempts a ON c.slug = a.challenge_slug
            GROUP BY c.domain, c.subdomain, c.difficulty
            ORDER BY c.domain, c.subdomain, c.difficulty;
    """)
    conn.commit()
    return conn

def mark_solved(conn, slug: str, notes: str = ""):
    """Record a successful attempt at a challenge.

    Args:
        conn: Open sqlite3 connection containing the attempts table.
        slug: Challenge slug being marked as solved.
        notes: Optional free-form notes about the solution approach.
    """
    from datetime import timezone  # local: datetime.utcnow() is deprecated

    conn.execute(
        "INSERT INTO attempts (challenge_slug, attempted_at, solved, notes) VALUES (?,?,1,?)",
        # Timezone-aware UTC timestamp (ISO 8601 with explicit offset).
        (slug, datetime.now(timezone.utc).isoformat(), notes)
    )
    conn.commit()
    print(f"Marked {slug} as solved.")

def show_progress(conn, domain: str = None):
    query = "SELECT * FROM study_progress"
    params = []
    if domain:
        query += " WHERE domain = ?"
        params.append(domain)
    rows = conn.execute(query, params).fetchall()
    print(f"{'Domain':<20} {'Subdomain':<25} {'Difficulty':<12} {'Solved/Total':<15} {'%'}")
    print("-" * 80)
    for row in rows:
        print(f"{row[0]:<20} {row[1]:<25} {row[2]:<12} {row[4]}/{row[3]:<15} {row[5]}%")

Run show_progress(conn, domain="Algorithms") after each study session to see your completion rate per subdomain. This pairs naturally with the bulk challenge scraper to auto-populate the challenges table from HackerRank's public API.

Rate Limit Reference and Proxy Strategy

HackerRank's public API is relatively permissive but still enforces rate limits:

Endpoint Observed Limit Notes
/rest/contests/master/challenges ~60 req/min Lower for unauthenticated
/rest/contests/{slug}/challenges ~30 req/min Per-contest
/rest/hackers/{username}/recent_challenges ~20 req/min Profile endpoints
Challenge detail pages ~10 req/min HTML pages, stricter

For bulk scraping, rotate IPs using ThorData residential proxies to avoid hitting per-IP limits:

import itertools

# Five proxy endpoints; the user-{i} credential suffix presumably selects a
# distinct residential exit identity per entry — TODO confirm against the
# proxy provider's sticky-session documentation.
PROXY_POOL = [
    f"http://user-{i}:[email protected]:9000"
    for i in range(1, 6)
]
# Round-robin iterator over the pool: each new client takes the next proxy.
proxy_cycle = itertools.cycle(PROXY_POOL)

def get_client_with_proxy() -> httpx.Client:
    """Build an httpx client routed through the next proxy in the pool."""
    endpoint = next(proxy_cycle)
    # Route both schemes through the same proxy endpoint.
    routing = {"https://": endpoint, "http://": endpoint}
    return httpx.Client(
        proxies=routing,
        headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"},
        timeout=15,
    )

Residential IPs from ThorData provide clean exit nodes that haven't been flagged by HackerRank's bot detection systems.