How to Scrape HackerRank Challenges in 2026: Python, API Access & Practice Data
HackerRank hosts tens of thousands of coding challenges across algorithms, data structures, machine learning, and domain-specific tracks. If you're building a coding practice aggregator, tracking problem difficulty distributions, researching the competitive programming landscape, or creating custom study lists, HackerRank's data is valuable.
This guide covers what data is available, how HackerRank's site works under the hood, and how to extract challenge metadata, leaderboards, and submission statistics reliably.
What Data Is Available on HackerRank
HackerRank exposes several categories of public data:
- Challenge listings — problem titles, difficulty ratings (Easy/Medium/Hard/Expert), success rates, total submission counts
- Challenge domains — organized by track: Algorithms, Data Structures, Mathematics, Artificial Intelligence, etc.
- Problem metadata — tags, editorial availability, maximum score, time limits
- Leaderboards — per-challenge and per-contest rankings with score and time data
- Contest data — scheduled and past hackathons with participant counts
- User profiles — public profiles with solve counts, badges, and skill ratings
Most of this is accessible without authentication. Contest solutions and editorial content require login.
How HackerRank Serves Its Data
HackerRank runs a React single-page application backed by a REST API. The good news: they use versioned, predictable API endpoints. The less good news: the app requires JavaScript to render, and many endpoints need a valid CSRF token.
The core API pattern is:
https://www.hackerrank.com/rest/contests/master/challenges?offset=0&limit=20&track=algorithms
The master slug refers to the main practice area. For specific contests, the slug matches the contest URL.
Response format: JSON, well-structured, with a models array and pagination metadata.
Authentication: Not required for challenge listings and leaderboards. Required for submission history, user-specific data, and editorial access.
Scraping Challenge Listings Without Auth
The practice challenge endpoint is publicly accessible:
import httpx
import time
import json
from pathlib import Path
# Root of all HackerRank REST endpoints.
BASE = "https://www.hackerrank.com"

# Browser-like headers: the API rejects default Python user agents and
# validates the Referer, so these mimic a real Chrome session.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Accept": "application/json",
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://www.hackerrank.com/domains/algorithms",
    "X-Requested-With": "XMLHttpRequest",
}

# Practice-domain slugs to collect, as used by the `track` query parameter.
TRACK_SLUGS = [
    "algorithms",
    "data-structures",
    "mathematics",
    "artificial-intelligence",
    "databases",
    "shell",
    "python",
    "java",
    "c",
    "cpp",
]
def fetch_challenges_page(
    client: httpx.Client,
    track: str,
    offset: int = 0,
    limit: int = 50,
    difficulty: str | None = None,
    max_retries: int = 5,
) -> dict:
    """Fetch one page of practice challenges for a track.

    Args:
        client: Shared httpx client (reuses connections across pages).
        track: Track slug, e.g. "algorithms".
        offset: Pagination offset.
        limit: Page size.
        difficulty: Optional filter: easy, medium, hard, expert.
        max_retries: Cap on rate-limit retries (new parameter, defaulted —
            existing callers are unaffected).

    Returns:
        Parsed JSON payload (contains "models" and "total").

    Raises:
        httpx.HTTPStatusError: On non-429 error responses, or if rate
            limiting persists beyond ``max_retries`` attempts.
    """
    params = {
        "offset": offset,
        "limit": limit,
        "track": track,
    }
    if difficulty:
        params["difficulty"] = difficulty  # easy, medium, hard, expert
    # BUG FIX: the original retried 429s via unbounded recursion, which could
    # recurse forever (and grow the stack) under sustained rate limiting.
    # A bounded loop keeps the same behavior with a retry cap.
    for attempt in range(max_retries + 1):
        resp = client.get(
            f"{BASE}/rest/contests/master/challenges",
            params=params,
            headers=HEADERS,
            timeout=20,
        )
        if resp.status_code == 429 and attempt < max_retries:
            # Honor the server's Retry-After hint; default to 30s.
            retry_after = int(resp.headers.get("Retry-After", 30))
            print(f" Rate limited — waiting {retry_after}s")
            time.sleep(retry_after)
            continue
        resp.raise_for_status()
        return resp.json()
def scrape_track(track: str, max_challenges: int = 500) -> list[dict]:
    """Collect challenge metadata for a single track, one page at a time.

    Stops when the API reports no more models, when the reported total has
    been reached, or when ``max_challenges`` rows have been requested.
    """
    collected: list[dict] = []
    page_size = 50
    with httpx.Client() as client:
        for page_offset in range(0, max_challenges, page_size):
            payload = fetch_challenges_page(client, track, page_offset, page_size)
            models = payload.get("models", [])
            if not models:
                break
            collected.extend(
                {
                    "slug": item.get("slug"),
                    "name": item.get("name"),
                    "difficulty": item.get("difficulty_name"),
                    "max_score": item.get("max_score"),
                    "success_ratio": item.get("success_ratio"),
                    "total_count": item.get("total_count"),
                    "preview": item.get("preview"),
                    "track": track,
                    "tags": [t.get("name") for t in item.get("tags", [])],
                    "primary_technology": item.get("primary_technology"),
                }
                for item in models
            )
            total = payload.get("total", 0)
            print(f" {track}: {len(collected)}/{total} challenges")
            if len(collected) >= total:
                break
            time.sleep(1.5)  # stay under the per-IP rate limit
    return collected
if __name__ == "__main__":
    # Collect every configured track sequentially and write one combined
    # JSON file of all challenge metadata.
    all_challenges = []
    for track in TRACK_SLUGS:
        print(f"Scraping track: {track}")
        challenges = scrape_track(track)
        all_challenges.extend(challenges)
        print(f" Total so far: {len(all_challenges)}")
        time.sleep(3)  # courtesy pause between tracks
    out = Path("hackerrank_challenges.json")
    out.write_text(json.dumps(all_challenges, indent=2))
    print(f"\nSaved {len(all_challenges)} challenges to {out}")
Fetching Challenge Detail Pages
Each challenge has its own JSON endpoint with full problem metadata:
def fetch_challenge_detail(
    client: httpx.Client,
    slug: str,
    max_retries: int = 5,
) -> dict:
    """Fetch full metadata for one challenge from its detail endpoint.

    Args:
        client: Shared httpx client.
        slug: Challenge slug (e.g. "solve-me-first").
        max_retries: Cap on rate-limit retries (new parameter, defaulted).

    Returns:
        Normalized dict of detail fields, or {} when the challenge is a 404.

    Raises:
        httpx.HTTPStatusError: On other error responses, or if rate
            limiting persists beyond ``max_retries`` attempts.
    """
    # BUG FIX: the original retried 429s via unbounded recursion; a
    # persistent rate limit would recurse forever. Use a bounded loop.
    for attempt in range(max_retries + 1):
        resp = client.get(
            f"{BASE}/rest/contests/master/challenges/{slug}",
            headers=HEADERS,
            timeout=20,
        )
        if resp.status_code == 404:
            return {}
        if resp.status_code == 429 and attempt < max_retries:
            time.sleep(30)
            continue
        resp.raise_for_status()
        break
    model = resp.json().get("model", {})
    return {
        "slug": model.get("slug"),
        "name": model.get("name"),
        "body_html": model.get("body_html"),  # full problem statement
        "difficulty": model.get("difficulty_name"),
        "max_score": model.get("max_score"),
        "success_ratio": model.get("success_ratio"),
        "total_submissions": model.get("total_count"),
        "time_limit": model.get("time_limit"),
        "memory_limit": model.get("memory_limit"),
        "editorial_available": model.get("editorial") is not None,
        # renamed loop var: `l` shadowed a builtin-like single letter
        "languages": [lang.get("name") for lang in model.get("languages", [])],
        "input_format": model.get("input_format"),
        "output_format": model.get("output_format"),
        "constraints": model.get("constraints"),
    }
# Enrich a batch of challenges with full details
def enrich_challenges(
    challenges: list[dict],
    max_items: int = 100,
    delay: float = 2.0,
) -> list[dict]:
    """Merge detail-endpoint fields into the first ``max_items`` listings.

    Records without a slug are passed through untouched; failed detail
    fetches are logged and the original record is kept.
    """
    enriched: list[dict] = []
    subset = challenges[:max_items]
    with httpx.Client() as client:
        for idx, record in enumerate(subset, start=1):
            slug = record.get("slug")
            if not slug:
                # Nothing to look up — keep the record as-is (no delay).
                enriched.append(record)
                continue
            try:
                record.update(fetch_challenge_detail(client, slug))
            except httpx.HTTPError as exc:
                print(f" Failed to fetch detail for {slug}: {exc}")
            enriched.append(record)
            if idx % 10 == 0:
                print(f" Enriched {idx}/{min(max_items, len(challenges))}")
            time.sleep(delay)
    return enriched
Scraping Leaderboards
Per-challenge leaderboards show who solved it, their score, and time elapsed:
def fetch_leaderboard(
    client: httpx.Client,
    challenge_slug: str,
    offset: int = 0,
    limit: int = 50,
    max_retries: int = 5,
) -> dict:
    """Fetch one page of a per-challenge leaderboard.

    Args:
        client: Shared httpx client.
        challenge_slug: Challenge slug whose leaderboard is requested.
        offset: Pagination offset.
        limit: Page size.
        max_retries: Cap on rate-limit retries (new parameter, defaulted).

    Returns:
        Parsed JSON payload (contains "models" and "total").

    Raises:
        httpx.HTTPStatusError: On non-429 error responses, or if rate
            limiting persists beyond ``max_retries`` attempts.
    """
    # CONSISTENCY FIX: the other fetchers in this file back off on a 429;
    # this one previously raised immediately via raise_for_status().
    for attempt in range(max_retries + 1):
        resp = client.get(
            f"{BASE}/rest/contests/master/challenges/{challenge_slug}/leaderboard",
            params={"offset": offset, "limit": limit},
            headers=HEADERS,
            timeout=20,
        )
        if resp.status_code == 429 and attempt < max_retries:
            time.sleep(int(resp.headers.get("Retry-After", 30)))
            continue
        resp.raise_for_status()
        return resp.json()
def scrape_challenge_leaderboard(
    challenge_slug: str,
    max_entries: int = 200,
) -> list[dict]:
    """Page through a challenge leaderboard, collecting up to max_entries rows."""
    page_size = 50
    rows: list[dict] = []
    with httpx.Client() as client:
        offset = 0
        while offset < max_entries:
            payload = fetch_leaderboard(client, challenge_slug, offset, page_size)
            page = payload.get("models", [])
            if not page:
                break
            rows.extend(
                {
                    "rank": row.get("rank"),
                    "hacker": row.get("hacker"),
                    "score": row.get("score"),
                    "time_taken": row.get("time_taken"),
                    "language": row.get("language"),
                }
                for row in page
            )
            offset += page_size
            if offset >= payload.get("total", 0):
                break
            time.sleep(1.0)
    return rows
# Example: top solvers for a famous problem
# NOTE: this runs at import time and performs live network requests.
board = scrape_challenge_leaderboard("solve-me-first")
for entry in board[:10]:
    print(f" #{entry['rank']:>4} | {entry['hacker']:<20} | {entry['score']:>3} pts | {entry['language']}")
Contest Data
HackerRank hosts regular contests. The contest list endpoint returns structured data:
def fetch_contests(
    contest_type: str = "upcoming",  # upcoming, current, past
    offset: int = 0,
    limit: int = 20,
) -> list[dict]:
    """Return one page of contests filtered by status (upcoming/current/past)."""
    response = httpx.get(
        f"{BASE}/rest/contests",
        params={
            "offset": offset,
            "limit": limit,
            "filter": contest_type,
        },
        headers=HEADERS,
        timeout=20,
    )
    response.raise_for_status()
    # Normalize each contest model into a flat summary dict.
    return [
        {
            "slug": contest.get("slug"),
            "name": contest.get("name"),
            "start_time": contest.get("epoch_starttime"),
            "end_time": contest.get("epoch_endtime"),
            "description": contest.get("description"),
            "participant_count": contest.get("total_participants"),
        }
        for contest in response.json().get("models", [])
    ]
# Get upcoming contests
# NOTE: executes at import time and hits the live API.
upcoming = fetch_contests("upcoming")
for c in upcoming:
    print(f" {c['name']}: {c.get('participant_count', 0)} participants")
# Get challenges from a specific contest
def fetch_contest_challenges(contest_slug: str) -> list[dict]:
    """List the challenge models belonging to a single contest."""
    response = httpx.get(
        f"{BASE}/rest/contests/{contest_slug}/challenges",
        headers=HEADERS,
        timeout=20,
    )
    response.raise_for_status()
    payload = response.json()
    return payload.get("models", [])
Authenticated Requests: Session Cookies
Some HackerRank data requires authentication — notably submission history, editorial solutions, and user-specific stats. You can use browser session cookies to authenticate requests:
import httpx
from bs4 import BeautifulSoup
def create_authenticated_session(
    session_cookie: str,
    csrf_token: str,
) -> httpx.Client:
    """Build an httpx client carrying HackerRank session and CSRF credentials."""
    session = httpx.Client()
    # The session cookie authenticates the user; the CSRF header is required
    # on state-bearing endpoints.
    session.cookies.set("_hrank_session", session_cookie, domain="www.hackerrank.com")
    extra_headers = {
        "X-CSRF-Token": csrf_token,
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 Chrome/131.0.0.0 Safari/537.36",
        "Referer": "https://www.hackerrank.com/",
    }
    session.headers.update(extra_headers)
    return session
def fetch_my_submissions(
    client: httpx.Client,
    challenge_slug: str,
) -> list[dict]:
    """Return the authenticated user's submissions for one challenge.

    Requires a client built with valid session cookies and CSRF token.
    """
    url = (
        "https://www.hackerrank.com/rest/contests/master/challenges/"
        f"{challenge_slug}/submissions"
    )
    response = client.get(url, timeout=20)
    response.raise_for_status()
    return response.json().get("models", [])
# Get CSRF token from cookies after login
# Open DevTools > Application > Cookies > _csrf_token after logging in
# SESSION = "your-session-cookie-value"
# CSRF = "your-csrf-token-value"
# client = create_authenticated_session(SESSION, CSRF)
Anti-Bot Detection on HackerRank
HackerRank's defenses are moderate compared to sites like LinkedIn or Indeed:
- Rate limiting by IP — Aggressive scraping from a single IP triggers 429 responses. The rate limit appears to be around 60-100 requests/minute.
- User-Agent checks — Requests with Python's default user agent (`python-httpx/...`) are blocked or return empty results.
- Referer validation — The API checks that requests come from `www.hackerrank.com` pages via the Referer header.
- Session validation for sensitive endpoints — The submission and editorial endpoints require valid session cookies.
What Does Not Work
- Bare `requests.get()` with no headers
- Concurrent requests without delays (triggers 429 within seconds)
- Datacenter IPs for bulk scraping (flagged quickly)
What Works
- Proper headers with real Chrome User-Agent and Referer
- 1-2 second delays between requests
- Residential proxy rotation for sustained scraping
For large-scale data collection — say, pulling metadata for all 4,000+ algorithm challenges — residential proxies are essential. ThorData provides rotating residential IPs that distribute requests across different IP addresses, preventing the per-IP rate limiting HackerRank uses:
import httpx
import time
from typing import Optional
# ThorData residential proxy — https://thordata.partnerstack.com/partner/0a0x4nzb (or [Oxylabs](https://oxylabs.go2cloud.org/aff_c?offer_id=7&aff_id=2066&url_id=174))
# URL-scheme -> proxy-gateway mapping; both schemes route through the same
# rotating residential gateway. Replace USER/PASS with real credentials.
PROXY_CONFIG = {
    "http://": "http://USER:[email protected]:9000",
    "https://": "http://USER:[email protected]:9000",
}
def fetch_with_proxy(
    url: str,
    params: Optional[dict] = None,
    max_retries: int = 5,
) -> Optional[dict]:
    """GET *url* through the residential proxy gateway with exponential backoff.

    Args:
        url: Absolute URL to fetch.
        params: Optional query parameters.
        max_retries: Attempts before giving up on timeouts / rate limits.

    Returns:
        Parsed JSON body, or None when every attempt fails.
    """
    backoff = 2
    for attempt in range(max_retries):
        try:
            resp = httpx.get(
                url,
                params=params,
                headers=HEADERS,
                # BUG FIX: httpx's `proxy=` accepts a single proxy URL, not a
                # scheme->URL mapping; passing the dict raises at call time.
                # Both schemes in PROXY_CONFIG point at the same gateway, so a
                # single proxy URL preserves the intended routing.
                proxy=PROXY_CONFIG["https://"],
                timeout=20,
            )
            if resp.status_code == 429:
                wait = backoff ** attempt
                print(f" Rate limited, waiting {wait}s (attempt {attempt+1})")
                time.sleep(wait)
                continue
            resp.raise_for_status()
            return resp.json()
        except httpx.TimeoutException:
            # TimeoutException must be caught before its parent HTTPError.
            print(f" Timeout on attempt {attempt+1}")
            time.sleep(backoff ** attempt)
        except httpx.HTTPError as e:
            print(f" HTTP error: {e}")
            return None
    return None
Parsing Problem HTML Content
Challenge body HTML contains the problem statement, constraints, sample input/output, and test cases. Parsing this into structured data requires BeautifulSoup:
from bs4 import BeautifulSoup
import re
def parse_problem_html(html_body: str) -> dict:
    """Split a HackerRank problem statement into named sections plus samples.

    Text before the first <h3> goes into "description"; each subsequent
    <h3> heading starts a new section keyed by its lower-cased text.
    """
    if not html_body:
        return {}
    soup = BeautifulSoup(html_body, "html.parser")

    def _flush(name: str, chunks: list, into: dict) -> None:
        # Join accumulated text, dropping whitespace-only fragments.
        if chunks:
            into[name] = " ".join(c.strip() for c in chunks if c.strip())

    sections: dict = {}
    heading = "description"
    buffer: list = []
    for node in soup.children:
        if node.name == "h3":
            _flush(heading, buffer, sections)
            heading = node.get_text(strip=True).lower()
            buffer = []
        elif node.name in ("p", "ul", "ol", "pre"):
            buffer.append(node.get_text())
    _flush(heading, buffer, sections)

    # Pull patterns like "1 <= N <= 10^5" out of the constraints section.
    n_constraints = re.findall(
        r'\b(?:1|10)\^{?\d+}?\s*(?:<=|<)\s*(?:N|n|Q|q)\s*(?:<=|<)\s*(?:1|10)\^{?\d+}?',
        sections.get("constraints", ""),
    )

    # Pair up sample input/output <pre> blocks positionally.
    sample_cases = [
        {"input": sample_in.get_text(), "output": sample_out.get_text()}
        for sample_in, sample_out in zip(
            soup.find_all("pre", class_=re.compile("sample-input")),
            soup.find_all("pre", class_=re.compile("sample-output")),
        )
    ]

    return {
        **sections,
        "n_constraints": n_constraints,
        "sample_cases": sample_cases,
    }
Building a Difficulty Distribution Dataset
One useful analysis: map difficulty to success ratio across all tracks to find which problem categories are hardest in practice:
import json
import statistics
from pathlib import Path
from collections import defaultdict
def analyze_difficulty_distribution(challenges_file: str) -> dict:
    """Summarize success ratios per difficulty label and print a table.

    Args:
        challenges_file: Path to a JSON list of challenge dicts (as produced
            by scrape_track), each carrying "difficulty" and "success_ratio".

    Returns:
        Mapping of difficulty label -> {"count", "avg_success", "median_success"}.
        Labels with no usable ratios are omitted.
    """
    data = json.loads(Path(challenges_file).read_text())
    # NOTE: the original also accumulated a per-track breakdown that was
    # never read; that dead code has been removed.
    by_difficulty = defaultdict(list)
    for ch in data:
        diff = ch.get("difficulty", "Unknown")
        ratio = ch.get("success_ratio")
        if ratio is not None:
            by_difficulty[diff].append(float(ratio))
    report = {}
    print(f"{'Difficulty':<12} {'Count':>6} {'Avg Success':>12} {'Median':>8}")
    print("-" * 42)
    # Fixed label order so the table reads easiest-to-hardest.
    for diff in ["Easy", "Medium", "Hard", "Expert"]:
        ratios = by_difficulty.get(diff, [])
        if not ratios:
            continue
        avg = statistics.mean(ratios)
        med = statistics.median(ratios)
        print(f"{diff:<12} {len(ratios):>6} {avg:>11.1f}% {med:>7.1f}%")
        report[diff] = {"count": len(ratios), "avg_success": avg, "median_success": med}
    return report
# Run analysis on collected data
# analyze_difficulty_distribution("hackerrank_challenges.json")
Bulk Scraping All Tracks
Here is a complete pipeline that collects all challenges across every domain:
#!/usr/bin/env python3
"""
HackerRank challenge dataset builder.
Collects challenge metadata across all major tracks and saves to JSON.
"""
import httpx
import json
import time
from pathlib import Path
from datetime import datetime
# Root of all HackerRank REST endpoints.
BASE = "https://www.hackerrank.com"
# NOTE: the output directory is created at import time.
OUTPUT_DIR = Path("hackerrank_data")
OUTPUT_DIR.mkdir(exist_ok=True)
# Every practice-domain slug this pipeline collects.
ALL_TRACKS = [
    "algorithms", "data-structures", "mathematics",
    "artificial-intelligence", "databases", "shell",
    "python", "java", "c", "cpp", "javascript",
    "ruby", "linux-shell", "distributed-systems",
]
# Browser-like headers; the API rejects default Python user agents.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 Chrome/131.0.0.0 Safari/537.36",
    "Accept": "application/json",
    "Referer": "https://www.hackerrank.com/",
}
def fetch_all_challenges_for_track(track: str) -> list[dict]:
    """Collect every challenge listing in one track (capped at 1000 rows).

    Best-effort: any fetch error logs a message and returns what was
    collected so far.
    """
    results: list[dict] = []
    offset = 0
    page_size = 50
    with httpx.Client() as client:
        while True:
            try:
                response = client.get(
                    f"{BASE}/rest/contests/master/challenges",
                    params={"offset": offset, "limit": page_size, "track": track},
                    headers=HEADERS,
                    timeout=20,
                )
                if response.status_code == 429:
                    # Back off, then retry the same page.
                    time.sleep(30)
                    continue
                response.raise_for_status()
                payload = response.json()
            except Exception as e:
                print(f" Error fetching {track} at offset {offset}: {e}")
                break
            page = payload.get("models", [])
            if not page:
                break
            results.extend(
                {
                    "slug": item.get("slug"),
                    "name": item.get("name"),
                    "difficulty": item.get("difficulty_name"),
                    "track": track,
                    "max_score": item.get("max_score"),
                    "success_ratio": item.get("success_ratio"),
                    "total_submissions": item.get("total_count"),
                    "tags": [t.get("name") for t in item.get("tags", [])],
                    "primary_technology": item.get("primary_technology"),
                    "url": f"https://www.hackerrank.com/challenges/{item.get('slug')}/problem",
                }
                for item in page
            )
            offset += page_size
            # Hard cap at 1000 rows per track as a safety valve.
            if offset >= payload.get("total", 0) or offset >= 1000:
                break
            time.sleep(1.5)
    return results
def run_full_collection():
    """Collect all tracks, then write one timestamped JSON dataset."""
    dataset: list[dict] = []
    per_track_counts: dict[str, int] = {}
    for track in ALL_TRACKS:
        print(f"Collecting: {track}")
        rows = fetch_all_challenges_for_track(track)
        dataset.extend(rows)
        per_track_counts[track] = len(rows)
        print(f" {len(rows)} challenges collected")
        time.sleep(3)  # pause between tracks
    # Save full dataset
    timestamp = datetime.now().strftime("%Y%m%d_%H%M")
    out_file = OUTPUT_DIR / f"hackerrank_all_{timestamp}.json"
    out_file.write_text(json.dumps({
        "collected_at": datetime.now().isoformat(),
        "total_challenges": len(dataset),
        "by_track": per_track_counts,
        "challenges": dataset,
    }, indent=2))
    print(f"\nComplete! {len(dataset)} challenges saved to {out_file}")
    # Per-track summary, busiest tracks first.
    for track, count in sorted(per_track_counts.items(), key=lambda kv: kv[1], reverse=True):
        print(f" {track:<30} {count:>4} challenges")


if __name__ == "__main__":
    run_full_collection()
Use Cases
With this data collected, you can build:
Practice schedulers — Sort problems by success_ratio and difficulty to build a progressive curriculum. Start with Easy problems that have 80%+ success rates, then move to Medium problems with 40-60% rates.
Skill gap analysis — Compare difficulty distributions across tracks. AI/ML challenges tend to have lower success rates than equivalent-difficulty Algorithms problems.
Hiring signal research — Many companies use HackerRank for technical screening. The most-attempted problems in certain domains reveal what skills companies test for.
Contest trackers — Monitor upcoming contests, participant counts, and winner data to identify when activity spikes in the competitive programming community.
Study plan generators — Given a target company or role, recommend a sequence of HackerRank problems that match that company's screening preferences.
Rate Limits and Staying Unblocked
HackerRank applies IP-level rate limiting. The thresholds are approximately:
| Request Pattern | Result |
|---|---|
| 1 request/second sustained | Works fine |
| 3-5 requests/second burst | Occasional 429 |
| 10+ requests/second | Consistent 429s |
| No delays for 100+ requests | IP block (temporary) |
For sustainable high-volume collection, use ThorData's residential proxy pool. Rotating residential IPs means each request appears to come from a different user, effectively eliminating per-IP rate limits. With proxy rotation, you can safely increase your request rate to 5-10/second across the pool.
Always add jitter to your delays — fixed intervals (exactly 1.000s) are a bot signal. Randomize between 0.8 and 2.5 seconds for more human-like patterns.
Summary
HackerRank exposes a clean REST API that returns JSON for challenge listings, leaderboards, and contest data. No authentication is needed for most public data. The key parameters are the track slug and pagination offsets.
The main scraping challenges are rate limiting (solvable with residential proxies) and proper header configuration (a real Chrome User-Agent and Referer header is required). With both in place, you can collect the full challenge catalog, difficulty distributions, success rates, and leaderboard data for competitive programming research or study tool development.
Advanced: Contest Scraping and Historical Data
HackerRank hosts contests regularly. Building a historical contest dataset reveals difficulty curves, prize structures, and participation patterns:
import httpx
import json
import time
from pathlib import Path
from datetime import datetime
# Root of all HackerRank REST endpoints.
BASE = "https://www.hackerrank.com"
# Minimal browser-like headers for the contest endpoints.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/131.0.0.0 Safari/537.36",
    "Accept": "application/json",
    "Referer": "https://www.hackerrank.com/",
}
def fetch_past_contests(max_pages: int = 10) -> list[dict]:
    """Page through the past-contest listing, up to ``max_pages`` pages."""
    contests: list[dict] = []
    page_size = 20
    offset = 0
    with httpx.Client() as client:
        while offset < max_pages * page_size:
            response = client.get(
                f"{BASE}/rest/contests",
                params={"filter": "past", "offset": offset, "limit": page_size},
                headers=HEADERS,
                timeout=20,
            )
            if response.status_code == 429:
                # Back off, then retry the same page.
                time.sleep(30)
                continue
            response.raise_for_status()
            payload = response.json()
            page = payload.get("models", [])
            if not page:
                break
            contests.extend(
                {
                    "slug": c.get("slug"),
                    "name": c.get("name"),
                    "start_time": c.get("epoch_starttime"),
                    "end_time": c.get("epoch_endtime"),
                    "description": c.get("description", "")[:200],
                    "participant_count": c.get("total_participants", 0),
                    "challenge_count": c.get("challenges_count", 0),
                }
                for c in page
            )
            offset += page_size
            total = payload.get("total", 0)
            print(f" Fetched {len(contests)}/{total} contests")
            if offset >= total:
                break
            time.sleep(2)
    return contests
def fetch_contest_leaderboard(contest_slug: str, max_entries: int = 100) -> list[dict]:
    """Collect up to ``max_entries`` leaderboard rows for one contest.

    Stops quietly on any non-200 response (e.g. private contests).
    """
    page_size = 25
    rows: list[dict] = []
    with httpx.Client() as client:
        offset = 0
        while offset < max_entries:
            response = client.get(
                f"{BASE}/rest/contests/{contest_slug}/leaderboard",
                params={"offset": offset, "limit": page_size},
                headers=HEADERS,
                timeout=20,
            )
            if response.status_code != 200:
                break
            payload = response.json()
            page = payload.get("models", [])
            if not page:
                break
            rows.extend(
                {
                    "rank": row.get("rank"),
                    "hacker": row.get("hacker"),
                    "score": row.get("score"),
                    "time_taken": row.get("time_taken"),
                }
                for row in page
            )
            offset += page_size
            if offset >= payload.get("total", 0):
                break
            time.sleep(1)
    return rows
Tracking Problem Difficulty Over Time
One useful insight from HackerRank data: comparing stated difficulty labels against actual success rates reveals that "Medium" problems vary enormously in practice:
import json
import statistics
from pathlib import Path
from collections import defaultdict
def analyze_difficulty_vs_success(challenges_file: str) -> None:
    """Print how stated difficulty labels compare to observed success rates.

    Accepts either a bare JSON list of challenges or a dict wrapping one
    under "challenges". Non-numeric ratios are skipped silently.
    """
    raw = json.loads(Path(challenges_file).read_text())
    challenges = raw if isinstance(raw, list) else raw.get("challenges", [])
    per_difficulty = defaultdict(list)
    per_track = defaultdict(lambda: defaultdict(list))
    for ch in challenges:
        label = ch.get("difficulty", "Unknown")
        track = ch.get("track", "unknown")
        ratio = ch.get("success_ratio")
        if ratio is None:
            continue
        try:
            value = float(ratio)
        except (ValueError, TypeError):
            continue
        per_difficulty[label].append(value)
        per_track[track][label].append(value)
    print("Difficulty vs. Actual Success Rate:")
    print(f"{'Difficulty':<12} {'Count':>6} {'Avg%':>8} {'Median%':>9} {'Min%':>7} {'Max%':>7}")
    print("-" * 52)
    for label in ["Easy", "Medium", "Hard", "Expert"]:
        ratios = per_difficulty.get(label, [])
        if not ratios:
            continue
        print(
            f"{label:<12} {len(ratios):>6} {statistics.mean(ratios):>7.1f}% "
            f"{statistics.median(ratios):>8.1f}% "
            f"{min(ratios):>6.1f}% "
            f"{max(ratios):>6.1f}%"
        )
    print("\nHardest tracks by average success rate:")
    track_averages = []
    for track, buckets in per_track.items():
        values = [v for bucket in buckets.values() for v in bucket]
        if values:
            track_averages.append((track, statistics.mean(values), len(values)))
    # Lowest average success rate first = hardest in practice.
    for track, avg, count in sorted(track_averages, key=lambda item: item[1])[:10]:
        print(f" {track:<30} {avg:>6.1f}% avg success ({count} challenges)")
Extracting Hiring Signal Data
Many companies use HackerRank for technical interviews. The platforms they test on and the problem types they favor reveal hiring preferences:
import re
from bs4 import BeautifulSoup
def extract_interview_kit_companies() -> list[dict]:
    """
    HackerRank's Interview Preparation Kit page lists companies
    that commonly use specific problem types for screening.
    Scrape the interview kit category structure.
    """
    import httpx
    url = "https://www.hackerrank.com/interview/interview-preparation-kit"
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/131.0.0.0 Safari/537.36",
    }
    response = httpx.get(url, headers=request_headers, timeout=20)
    if response.status_code != 200:
        return []
    soup = BeautifulSoup(response.text, "html.parser")
    # Each category card carries a topic heading and (optionally) a count badge.
    topics: list[dict] = []
    for section in soup.select(".challenge-category"):
        heading = section.select_one("h3")
        if heading is None:
            continue
        badge = section.select_one(".challenge-count")
        topics.append({
            "topic": heading.get_text(strip=True),
            "challenge_count": badge.get_text(strip=True) if badge else None,
        })
    return topics
def get_company_tagged_challenges(company_name: str) -> list[dict]:
    """Fetch challenges tagged with a specific company name.

    Args:
        company_name: Exact company tag as used by HackerRank's
            `filters[companies][]` query parameter.

    Returns:
        Up to 50 challenge summaries, or [] on any non-200 response.
    """
    # Company-tagged problems appear in the URL pattern:
    # /challenges?filters%5Bcompanies%5D%5B%5D=COMPANY_NAME
    import httpx  # local import kept from original; unused `time` import removed
    resp = httpx.get(
        "https://www.hackerrank.com/rest/contests/master/challenges",
        params={
            "filters[companies][]": company_name,
            "offset": 0,
            "limit": 50,
        },
        headers=HEADERS,
        timeout=20,
    )
    if resp.status_code != 200:
        return []
    challenges = []
    for ch in resp.json().get("models", []):
        challenges.append({
            "slug": ch.get("slug"),
            "name": ch.get("name"),
            "difficulty": ch.get("difficulty_name"),
            "success_ratio": ch.get("success_ratio"),
            "total_submissions": ch.get("total_count"),
        })
    return challenges
Building a Study Planner
Using collected challenge data, build a personalized study planner that sequences problems optimally:
import json
from pathlib import Path
from collections import defaultdict
def build_study_plan(
    challenges_file: str,
    target_track: str = "algorithms",
    daily_problems: int = 3,
    days: int = 30,
) -> list[dict]:
    """Sequence a track's challenges into a day-by-day study plan.

    Problems are ordered Easy -> Medium -> Hard -> Expert; within each
    level, higher success rates (easier in practice) come first.

    Args:
        challenges_file: JSON file — either a list of challenge dicts or a
            dict wrapping one under "challenges".
        target_track: Track slug to include.
        daily_problems: Problems scheduled per day.
        days: Maximum number of days in the plan.

    Returns:
        List of {"day": int, "problems": [...]} entries.
    """
    data = json.loads(Path(challenges_file).read_text())
    challenges = data if isinstance(data, list) else data.get("challenges", [])
    # Filter to target track
    track_challenges = [
        ch for ch in challenges
        if ch.get("track") == target_track
    ]
    # Sequence by difficulty: Easy -> Medium -> Hard -> Expert
    difficulty_order = {"Easy": 0, "Medium": 1, "Hard": 2, "Expert": 3}

    def _sort_key(ch: dict) -> tuple:
        ratio = ch.get("success_ratio")
        # BUG FIX: the original used `(... or 50)`, which is falsy for a
        # legitimate 0% success rate and silently replaced it with the 50
        # default. Only substitute when the value is actually missing.
        if ratio is None:
            ratio = 50
        return (
            difficulty_order.get(ch.get("difficulty", "Medium"), 1),
            -ratio,  # within each level, start with higher success rate
        )

    track_challenges.sort(key=_sort_key)
    # Build daily plan
    plan = []
    idx = 0
    for day in range(1, days + 1):
        if idx >= len(track_challenges):
            break
        daily = []
        for ch in track_challenges[idx:idx + daily_problems]:
            daily.append({
                "name": ch.get("name"),
                "slug": ch.get("slug"),
                "difficulty": ch.get("difficulty"),
                "success_rate": ch.get("success_ratio"),
                "url": f"https://www.hackerrank.com/challenges/{ch.get('slug')}/problem",
            })
        idx += len(daily)
        if daily:
            plan.append({"day": day, "problems": daily})
    return plan
def print_study_plan_week(plan: list[dict]) -> None:
    """Pretty-print the first seven days of a study plan."""
    print("30-Day HackerRank Study Plan (first week):")
    print()
    for entry in plan[:7]:
        print(f"Day {entry['day']}:")
        for problem in entry["problems"]:
            # Missing / zero success rates render as "N/A".
            rate = f"{problem['success_rate']:.0f}%" if problem.get('success_rate') else "N/A"
            print(f" - {problem['name']} ({problem['difficulty']}, {rate} success rate)")
        print()
Saving and Exporting Data
import json
import csv
from pathlib import Path
from datetime import datetime
def save_challenge_dataset(
    challenges: list[dict],
    output_dir: str = "hackerrank_data",
) -> None:
    """Write the dataset as timestamped JSON and CSV files in *output_dir*."""
    target = Path(output_dir)
    target.mkdir(exist_ok=True)
    stamp = datetime.now().strftime("%Y%m%d_%H%M")

    # JSON
    json_file = target / f"challenges_{stamp}.json"
    payload = {
        "collected_at": datetime.now().isoformat(),
        "total": len(challenges),
        "challenges": challenges,
    }
    json_file.write_text(json.dumps(payload, indent=2))

    # CSV — extra keys in the dicts are silently ignored.
    csv_file = target / f"challenges_{stamp}.csv"
    columns = ["slug", "name", "difficulty", "track", "max_score",
               "success_ratio", "total_submissions", "url"]
    with open(csv_file, "w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=columns, extrasaction="ignore")
        writer.writeheader()
        writer.writerows(challenges)

    print(f"Saved {len(challenges)} challenges:")
    print(f" JSON: {json_file}")
    print(f" CSV: {csv_file}")
def export_leaderboard_analysis(
    challenge_slug: str,
    leaderboard: list[dict],
    output_dir: str = "hackerrank_data",
) -> None:
    """Summarize a leaderboard (language mix, score stats) and save to JSON.

    Args:
        challenge_slug: Slug used in the output file name.
        leaderboard: Rows as produced by scrape_challenge_leaderboard.
        output_dir: Directory for the analysis file (created if missing).
    """
    from collections import Counter
    out = Path(output_dir)
    out.mkdir(exist_ok=True)
    # Language distribution
    lang_counts = Counter(entry.get("language", "Unknown") for entry in leaderboard)
    # Score distribution
    scores = [entry.get("score", 0) for entry in leaderboard if entry.get("score") is not None]
    # BUG FIX: the original recomputed max(scores) once per element inside a
    # generator with a confusing constant second `if` clause (O(n^2));
    # hoist the max once. Empty-score behavior (count 0) is unchanged.
    top_score = max(scores) if scores else None
    analysis = {
        "challenge": challenge_slug,
        "total_entries": len(leaderboard),
        "language_distribution": dict(lang_counts.most_common(10)),
        "score_stats": {
            "min": min(scores) if scores else None,
            "max": top_score,
            "avg": sum(scores) / len(scores) if scores else None,
        },
        # Entries tied for the top recorded score (not the max possible score).
        "perfect_scores": sum(1 for s in scores if s == top_score),
    }
    out_file = out / f"leaderboard_{challenge_slug}.json"
    out_file.write_text(json.dumps(analysis, indent=2))
    print(f"Leaderboard analysis saved to {out_file}")
    print(f" Top languages: {dict(lang_counts.most_common(5))}")
Combining with Other Job Data
HackerRank challenge data becomes most valuable when cross-referenced with job market data. Companies that heavily tag problems with their names are often actively using those problem types in screening:
import json
from pathlib import Path
from collections import defaultdict
def match_challenges_to_job_requirements(
    challenges_file: str,
    jobs_file: str,
) -> dict:
    """Cross-reference HackerRank problems with LinkedIn job descriptions."""
    challenges = json.loads(Path(challenges_file).read_text())
    if isinstance(challenges, dict):
        challenges = challenges.get("challenges", [])
    jobs = json.loads(Path(jobs_file).read_text())

    # Index challenges by lower-cased tag.
    skill_index = defaultdict(list)
    for challenge in challenges:
        for tag in challenge.get("tags", []):
            skill_index[tag.lower()].append(challenge)

    # Count how many job descriptions mention each tag (substring match).
    skill_demand = defaultdict(int)
    for job in jobs:
        description = (job.get("description", "") or "").lower()
        for tag in skill_index:
            if tag in description:
                skill_demand[tag] += 1

    # Rank skills by job-market demand, most-mentioned first.
    ranked = sorted(skill_demand.items(), key=lambda item: item[1], reverse=True)
    print("HackerRank topics most demanded in job descriptions:")
    for skill, count in ranked[:15]:
        matching = skill_index[skill]
        print(f" {skill:<30} mentioned in {count} job descriptions ({len(matching)} challenges)")
    return dict(ranked)
Conclusion
HackerRank's REST API is unusually accessible for a major platform. Challenge metadata, leaderboards, and contest data are available without authentication. With proper rate limiting (1-2 second delays, residential proxy rotation from ThorData for high-volume use), you can build comprehensive datasets covering the full challenge catalog across all tracks.
The data enables applications ranging from personalized study planners and difficulty analysis tools to hiring intelligence platforms that map problem types to company screening preferences. The success ratio field alone — the percentage of all-time attempts that produced correct solutions — provides a ground-truth difficulty signal that is far more reliable than the subjective Easy/Medium/Hard labels.
Automated Progress Tracking
Track your own progress through HackerRank challenges using the authenticated API endpoints and build a personal dashboard:
import json
import time
from pathlib import Path
from datetime import datetime
from collections import defaultdict
def build_personal_progress_tracker(
    session_id: str,
    csrf_token: str,
    output_dir: str = "progress_tracking",
) -> None:
    """Snapshot the logged-in user's solved challenges per track to a JSON file."""
    import httpx
    out = Path(output_dir)
    out.mkdir(exist_ok=True)
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/131.0.0.0 Safari/537.36",
        "Accept": "application/json",
        "X-CSRF-Token": csrf_token,
    }
    auth_cookies = {"_hrank_session": session_id}
    progress_data: dict = {}
    # Fetch solved challenges per track; failed tracks are skipped quietly.
    tracks = ["algorithms", "data-structures", "mathematics", "python", "databases"]
    for track in tracks:
        resp = httpx.get(
            "https://www.hackerrank.com/rest/contests/master/challenges",
            params={"track": track, "limit": 50, "offset": 0, "solved": 1},
            headers=request_headers,
            cookies=auth_cookies,
            timeout=20,
        )
        if resp.status_code != 200:
            continue
        payload = resp.json()
        solved = payload.get("models", [])
        progress_data[track] = {
            "solved": len(solved),
            "total": payload.get("total", 0),
            "challenges": [
                {"slug": ch.get("slug"), "name": ch.get("name"), "difficulty": ch.get("difficulty_name")}
                for ch in solved
            ],
        }
        print(f" {track}: {len(solved)} solved")
        time.sleep(1.5)
    # Save progress snapshot
    snapshot = {
        "timestamp": datetime.now().isoformat(),
        "tracks": progress_data,
        "total_solved": sum(entry["solved"] for entry in progress_data.values()),
    }
    out_file = out / f"progress_{datetime.now().strftime('%Y%m%d_%H%M')}.json"
    out_file.write_text(json.dumps(snapshot, indent=2))
    print(f"Progress snapshot saved: {out_file}")
def visualize_progress_over_time(progress_dir: str) -> None:
    """Print a date/total/change table built from saved progress snapshots.

    Reads every ``progress_*.json`` file in *progress_dir* (sorted by name,
    which sorts chronologically given the timestamped filenames) and shows
    the solved-count delta between consecutive snapshots.
    """
    snapshot_files = sorted(Path(progress_dir).glob("progress_*.json"))
    if not snapshot_files:
        print("No progress snapshots found")
        return
    timeline = []
    for snap_path in snapshot_files:
        snapshot = json.loads(snap_path.read_text())
        per_track = {name: info["solved"] for name, info in snapshot["tracks"].items()}
        timeline.append(
            {
                "timestamp": snapshot["timestamp"],
                "total_solved": snapshot["total_solved"],
                "by_track": per_track,
            }
        )
    print("Progress over time:")
    print(f"{'Date':<12} {'Total':>7} {'Change':>7}")
    print("-" * 30)
    previous_total = None
    for point in timeline:
        total = point["total_solved"]
        # First row has nothing to diff against, so its change is 0.
        delta = 0 if previous_total is None else total - previous_total
        delta_str = f"+{delta}" if delta >= 0 else str(delta)
        print(f"{point['timestamp'][:10]:<12} {total:>7} {delta_str:>7}")
        previous_total = total
API Reference Summary
Quick reference for all HackerRank endpoints used in this guide:
| Endpoint | Description | Auth Required |
|---|---|---|
/rest/contests/master/challenges |
List challenges by track | No |
/rest/contests/master/challenges/{slug} |
Challenge detail | No |
/rest/contests/master/challenges/{slug}/leaderboard |
Per-challenge leaderboard | No |
/rest/contests |
List contests | No |
/rest/contests/{slug}/challenges |
Contest challenges | No |
/rest/contests/{slug}/leaderboard |
Contest leaderboard | No |
/rest/users/{username} |
Public user profile | No |
/rest/contests/master/challenges?solved=1 |
Your solved challenges | Yes |
/rest/contests/master/challenges/{slug}/submissions |
Your submissions | Yes |
The unauthenticated endpoints are the richest source of data and require only proper User-Agent and Referer headers. For the authenticated endpoints, you need a valid _hrank_session cookie and X-CSRF-Token header obtained by logging in through a browser.
For high-volume data collection, use ThorData's residential proxy pool to distribute requests across multiple IPs and avoid the per-IP rate limiting that triggers after roughly 60-100 requests per minute.
Tracking HackerRank Hiring Challenges Over Time
Companies regularly publish "HackerRank Certification Tests" and hiring challenges under their own subdomains (e.g., company.hackerrank.com). Monitoring newly published contests is valuable for job seekers and competitive intelligence researchers.
import httpx
import json
from datetime import datetime, timedelta
def fetch_active_contests(min_age_days: int = 0, max_age_days: int = 30) -> list[dict]:
    """Fetch contests that started within a date window.

    Pages through ``/rest/contests`` (sorted newest-first) and stops as soon
    as a contest older than *max_age_days* appears, since every later page is
    older still.

    Args:
        min_age_days: Exclude contests newer than this many days.
        max_age_days: Exclude contests older than this many days.

    Returns:
        Summary dicts with slug, name, truncated description, timestamps,
        and a heuristic ``is_hiring`` flag based on the contest name.
    """
    url = "https://www.hackerrank.com/rest/contests"
    params = {
        "offset": 0,
        "limit": 100,
        "sort_by": "created_at",
        "sort_order": "desc",
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
        "Accept": "application/json",
    }
    cutoff_newer = datetime.utcnow() - timedelta(days=min_age_days)
    cutoff_older = datetime.utcnow() - timedelta(days=max_age_days)
    results = []
    with httpx.Client(headers=headers, timeout=15) as client:
        while True:
            resp = client.get(url, params=params)
            resp.raise_for_status()
            data = resp.json()
            contests = data.get("models", [])
            if not contests:
                break
            for contest in contests:
                created_str = contest.get("created_at", "")
                try:
                    created = datetime.fromisoformat(created_str.replace("Z", "+00:00"))
                    created = created.replace(tzinfo=None)
                except Exception:
                    continue  # unparseable/missing timestamp — skip this contest
                if created < cutoff_older:
                    return results  # Too old, stop paginating (list is sorted desc)
                if created <= cutoff_newer:
                    # .get() defaults do not cover fields stored as explicit
                    # null, so coalesce before slicing/lowercasing — a None
                    # description previously raised TypeError here.
                    name = contest.get("name") or ""
                    results.append({
                        "slug": contest.get("slug"),
                        "name": contest.get("name"),
                        "description": (contest.get("description") or "")[:200],
                        "created_at": created_str,
                        "end_time": contest.get("end_time"),
                        "is_hiring": "hiring" in name.lower()
                        or "interview" in name.lower(),
                    })
            params["offset"] += len(contests)
    return results
if __name__ == "__main__":
    # Pull the last week's contests and surface the ones that look like
    # company hiring challenges.
    recent = fetch_active_contests(max_age_days=7)
    hiring_like = [entry for entry in recent if entry["is_hiring"]]
    print(f"Found {len(recent)} recent contests, {len(hiring_like)} look like hiring challenges")
    for entry in hiring_like[:5]:
        print(f" {entry['name']} — {entry['slug']}")
Building a HackerRank Study Progress Tracker
When preparing for technical interviews, systematic tracking beats ad-hoc practice. Combine the challenge listing scraper with a local SQLite database:
import sqlite3
from pathlib import Path
from datetime import datetime
DB_PATH = Path.home() / ".hackerrank_tracker.db"
def init_tracker(db_path=None):
    """Open (and, on first use, create) the study-tracker SQLite database.

    Args:
        db_path: Optional database location; defaults to the module-level
            ``DB_PATH`` in the user's home directory. Pass ``":memory:"``
            for a throwaway in-memory database (useful in tests).

    Returns:
        An open ``sqlite3.Connection`` with the tracker schema ensured.
    """
    conn = sqlite3.connect(db_path if db_path is not None else DB_PATH)
    # executescript creates the tables and the reporting view idempotently.
    conn.executescript("""
    CREATE TABLE IF NOT EXISTS challenges (
        slug TEXT PRIMARY KEY,
        name TEXT,
        difficulty TEXT,
        domain TEXT,
        subdomain TEXT,
        score INTEGER,
        success_ratio REAL
    );
    CREATE TABLE IF NOT EXISTS attempts (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        challenge_slug TEXT,
        attempted_at TEXT,
        solved INTEGER DEFAULT 0,
        notes TEXT,
        FOREIGN KEY (challenge_slug) REFERENCES challenges(slug)
    );
    CREATE VIEW IF NOT EXISTS study_progress AS
    SELECT c.domain, c.subdomain, c.difficulty,
        COUNT(DISTINCT c.slug) AS total_challenges,
        COUNT(DISTINCT CASE WHEN a.solved=1 THEN a.challenge_slug END) AS solved,
        ROUND(100.0 * COUNT(DISTINCT CASE WHEN a.solved=1 THEN a.challenge_slug END)
        / COUNT(DISTINCT c.slug), 1) AS pct_complete
    FROM challenges c
    LEFT JOIN attempts a ON c.slug = a.challenge_slug
    GROUP BY c.domain, c.subdomain, c.difficulty
    ORDER BY c.domain, c.subdomain, c.difficulty;
    """)
    conn.commit()
    return conn
def mark_solved(conn, slug: str, notes: str = ""):
    """Record a successful attempt for *slug* in the attempts table.

    Args:
        conn: Open sqlite3 connection with the tracker schema.
        slug: Challenge slug to mark as solved.
        notes: Optional free-form note about the attempt.
    """
    attempted_at = datetime.utcnow().isoformat()
    row = (slug, attempted_at, notes)
    conn.execute(
        "INSERT INTO attempts (challenge_slug, attempted_at, solved, notes) VALUES (?,?,1,?)",
        row,
    )
    conn.commit()
    print(f"Marked {slug} as solved.")
def show_progress(conn, domain: "str | None" = None):
    """Print per-subdomain completion stats from the ``study_progress`` view.

    Args:
        conn: Open sqlite3 connection with the tracker schema.
        domain: If given, restrict output to that domain
            (e.g. ``"Algorithms"``). The original annotation was the
            implicit-Optional ``str = None``, which PEP 484 disallows.
    """
    query = "SELECT * FROM study_progress"
    params = []
    if domain:
        query += " WHERE domain = ?"
        params.append(domain)
    rows = conn.execute(query, params).fetchall()
    print(f"{'Domain':<20} {'Subdomain':<25} {'Difficulty':<12} {'Solved/Total':<15} {'%'}")
    print("-" * 80)
    for row in rows:
        # View columns: domain, subdomain, difficulty, total_challenges,
        # solved, pct_complete.
        print(f"{row[0]:<20} {row[1]:<25} {row[2]:<12} {row[4]}/{row[3]:<15} {row[5]}%")
Run show_progress(conn, domain="Algorithms") after each study session to see your completion rate per subdomain. This pairs naturally with the bulk challenge scraper to auto-populate the challenges table from HackerRank's public API.
Rate Limit Reference and Proxy Strategy
HackerRank's public API is relatively permissive but still enforces rate limits:
| Endpoint | Observed Limit | Notes |
|---|---|---|
/rest/contests/master/challenges |
~60 req/min | Lower for unauthenticated |
/rest/contests/{slug}/challenges |
~30 req/min | Per-contest |
/rest/hackers/{username}/recent_challenges |
~20 req/min | Profile endpoints |
| Challenge detail pages | ~10 req/min | HTML pages, stricter |
For bulk scraping, rotate IPs using ThorData residential proxies to avoid hitting per-IP limits:
import itertools
# Round-robin pool of five ThorData residential proxy gateway endpoints.
PROXY_POOL = [f"http://user-{slot}:[email protected]:9000" for slot in range(1, 6)]
# Shared infinite iterator: each next() yields the following proxy, wrapping
# back to the first after the last.
proxy_cycle = itertools.cycle(PROXY_POOL)
def get_client_with_proxy() -> httpx.Client:
    """Build an httpx client routed through the next proxy in the rotation.

    Returns:
        A fresh ``httpx.Client`` whose HTTP and HTTPS traffic both exit via
        the proxy endpoint drawn from the shared ``proxy_cycle`` iterator.
    """
    endpoint = next(proxy_cycle)
    # NOTE(review): the `proxies=` keyword was removed in newer httpx
    # releases (replaced by `proxy=`/`mounts=`) — confirm the pinned version.
    routing = {"https://": endpoint, "http://": endpoint}
    return httpx.Client(
        proxies=routing,
        headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"},
        timeout=15,
    )
Residential IPs from ThorData provide clean exit nodes that haven't been flagged by HackerRank's bot detection systems.