How to Scrape Duolingo Course Data via Undocumented API (2026)
Duolingo has no public API. They shut down their old API in 2023 and never replaced it. But the mobile app and website still talk to backend endpoints — and those endpoints are accessible if you know where to look.
This guide covers Duolingo's undocumented API endpoints that you can use to extract course data, language pairs, user profiles, streaks, and leaderboard information. All discovered through reverse-engineering the web app's network traffic.
How the Endpoints Were Found
Open Duolingo in Chrome, open DevTools → Network tab, and browse around. The web app makes GraphQL and REST calls to www.duolingo.com endpoints. The authentication token is a JWT stored in a cookie. Some endpoints work without auth, others require it.
Setup
pip install requests
No browser automation needed — these are plain HTTP endpoints.
Public Endpoints (No Auth Required)
Course Catalog
Duolingo's course data is partially available without authentication:
import requests
import json
def get_duolingo_courses():
    """Fetch all available Duolingo courses (language pairs).

    Tries the unauthenticated JSON endpoint first; if that call does not
    return HTTP 200, falls back to scraping the public courses page.

    Returns:
        list[dict]: one dict per course, sorted by learner count
        (descending). Keys: learning_language, learning_language_name,
        from_language, from_language_name, num_learners, phase.
    """
    url = "https://www.duolingo.com/api/1/courses/list"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
        "Accept": "application/json"
    }
    response = requests.get(url, headers=headers, timeout=15)
    if response.status_code != 200:
        # Fallback: scrape the courses page
        return get_courses_from_web()
    data = response.json()
    courses = []
    for course in data:
        courses.append({
            "learning_language": course.get("learning_language"),
            "learning_language_name": course.get("learning_language_name"),
            "from_language": course.get("from_language"),
            "from_language_name": course.get("from_language_name"),
            "num_learners": course.get("num_learners"),
            "phase": course.get("phase")  # 1=hatching, 2=beta, 3=stable
        })
    # Bug fix: `.get("num_learners", 0)` still yields None when the key is
    # present with a null value, and comparing None to int raises TypeError
    # during sorting. `or 0` normalizes both missing and null.
    return sorted(courses, key=lambda x: x.get("num_learners") or 0, reverse=True)
def get_courses_from_web():
    """Fallback: scrape course data from the incubator/courses page."""
    from bs4 import BeautifulSoup

    response = requests.get(
        "https://www.duolingo.com/courses",
        headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"},
        timeout=15,
    )
    soup = BeautifulSoup(response.text, "lxml")
    # Duolingo embeds course data in a __NEXT_DATA__ script tag
    next_data = soup.find("script", {"id": "__NEXT_DATA__"})
    if next_data is None:
        return []
    payload = json.loads(next_data.string)
    # Navigate the nested structure (changes periodically)
    page_props = payload.get("props", {}).get("pageProps", {})
    return page_props.get("courses", [])
# Example: list the ten most-popular courses in the catalog.
courses = get_duolingo_courses()
print(f"Found {len(courses)} language courses")
for entry in courses[:10]:
    learners = entry.get('num_learners', 'N/A')
    print(f" {entry['learning_language_name']} for {entry['from_language_name']} speakers — {learners} learners")
User Profiles
Public user profiles are accessible without authentication:
def get_user_profile(username, session=None):
    """Fetch a Duolingo user's public profile.

    Args:
        username: Duolingo username to look up.
        session: optional requests.Session (e.g. a proxied session from
            create_proxy_session). When omitted, a plain requests.get is
            used. Added because the proxy example later in this guide
            calls this function with a `session=` keyword.

    Returns:
        dict of profile fields on success, or {"error": ...} on failure.
    """
    url = f"https://www.duolingo.com/2017-06-30/users?username={username}&fields=streak,totalXp,currentCourseId,courses,streakData,creationDate"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
        "Accept": "application/json"
    }
    # A Session and the requests module share the same .get signature.
    client = session if session is not None else requests
    response = client.get(url, headers=headers, timeout=15)
    if response.status_code != 200:
        return {"error": f"HTTP {response.status_code}"}
    data = response.json()
    users = data.get("users", [])
    if not users:
        return {"error": "User not found"}
    user = users[0]
    courses = []
    for c in user.get("courses", []):
        # `level` may be a bare int or a {"level": n} dict depending on
        # which API revision served the response.
        level = c.get("level")
        if isinstance(level, dict):
            level = level.get("level")
        courses.append({
            "id": c.get("id"),
            "title": c.get("title"),
            "xp": c.get("xp"),
            "crowns": c.get("crowns"),
            "level": level
        })
    return {
        "username": username,
        "streak": user.get("streak"),
        "total_xp": user.get("totalXp"),
        "creation_date": user.get("creationDate"),
        "current_course": user.get("currentCourseId"),
        "courses": courses
    }
# Example: fetch a public profile by username.
profile = get_user_profile("LuisvonAhn")
print(json.dumps(profile, indent=2))
Authenticated Endpoints
Some endpoints require a valid session. You can get one by logging in:
def duolingo_login(username, password):
    """Authenticate with Duolingo and get a session token.

    Returns:
        (session, token) on success; (None, None) on any non-200 response.
    """
    credentials = {"login": username, "password": password}
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Content-Type": "application/json"
    }
    session = requests.Session()
    response = session.post(
        "https://www.duolingo.com/login",
        json=credentials,
        headers=request_headers,
        timeout=15,
    )
    if response.status_code != 200:
        return None, None
    # JWT token is in the response and set as cookie
    token = response.json().get("jwt") or response.headers.get("jwt")
    return session, token
def get_skill_tree(session, user_id, course_id):
    """Get the full skill tree for a course (requires auth).

    Returns:
        list of skill dicts, or None on a non-200 response.
    """
    endpoint = f"https://www.duolingo.com/2017-06-30/users/{user_id}/courses/{course_id}"
    response = session.get(
        endpoint,
        params={"fields": "skills,sections,currentSection"},
        timeout=15,
    )
    if response.status_code != 200:
        return None
    payload = response.json()
    return [
        {
            "name": entry.get("name"),
            "short_name": entry.get("shortName"),
            "levels": entry.get("levels"),
            "lessons": entry.get("lessons"),
            "words": entry.get("words", []),
            "tips_available": entry.get("tipsAndNotes") is not None,
        }
        for entry in payload.get("skills", [])
    ]
Leaderboard Data
The leaderboard API reveals top users and their XP:
def get_leaderboard(session, user_id):
    """Fetch the user's current leaderboard.

    Returns:
        dict with the user's tier and the active cohort members,
        or None on a non-200 response.
    """
    url = f"https://www.duolingo.com/leaderboards/7d9f5dd1-8423-4e7c-8c68-1ca2f2f2d1f0/users/{user_id}"
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept": "application/json"
    }
    response = session.get(url, headers=browser_headers, timeout=15)
    if response.status_code != 200:
        return None
    payload = response.json()
    cohort = [
        {
            "display_name": member.get("display_name"),
            "total_xp": member.get("total_xp_this_period"),
            "streak": member.get("streak"),
            "has_plus": member.get("has_plus"),
        }
        for member in payload.get("active", [])
    ]
    return {"tier": payload.get("tier"), "cohort": cohort}
Streak and Activity Data
Streak information is one of the most interesting public data points:
def get_streak_info(username):
    """Get detailed streak data for a user.

    Returns:
        dict of current/longest streak details, or None when the user
        does not exist or the request fails.
    """
    url = f"https://www.duolingo.com/2017-06-30/users?username={username}&fields=streak,streakData,practiceReminderSettings,currentCourseId"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept": "application/json"
    }
    response = requests.get(url, headers=headers, timeout=15)
    # Bug fix: the original called .json() unconditionally — rate-limit
    # and error responses are often HTML and would raise a decode error.
    if response.status_code != 200:
        return None
    data = response.json()
    users = data.get("users", [])
    if not users:
        return None
    user = users[0]
    streak_data = user.get("streakData", {})
    longest = streak_data.get("longestStreak", {})
    return {
        "username": username,
        "current_streak": user.get("streak"),
        "longest_streak": longest.get("length"),
        "longest_streak_start": longest.get("startDate"),
        "longest_streak_end": longest.get("endDate"),
        "streak_freeze_used": streak_data.get("currentStreak", {}).get("lastExtendedDate")
    }
# Example: print streak details for a public profile.
streak = get_streak_info("LuisvonAhn")
print(json.dumps(streak, indent=2))
Batch Scraping Multiple Users
If you're collecting data across many users, batch your requests carefully:
import time
import random
def batch_scrape_users(usernames, delay_range=(2.0, 4.0)):
    """Scrape profile data for a list of usernames.

    Sleeps a random delay (seconds, drawn from delay_range) between
    requests and prints progress every 10 users. Per-user failures are
    recorded as {"username", "error"} entries rather than raised.
    """
    collected = []
    total = len(usernames)
    for idx, name in enumerate(usernames, start=1):
        try:
            record = get_user_profile(name)
            streak_details = get_streak_info(name)
            if streak_details:
                record["longest_streak"] = streak_details.get("longest_streak")
                record["streak_freeze_used"] = streak_details.get("streak_freeze_used")
            collected.append(record)
        except Exception as e:
            print(f"Error for {name}: {e}")
            collected.append({"username": name, "error": str(e)})
        if idx % 10 == 0:
            print(f"Scraped {idx}/{total} users")
        time.sleep(random.uniform(*delay_range))
    return collected
Anti-Bot Measures and Rate Limits
Duolingo's API endpoints have specific protections you need to understand:
Rate limiting: The public endpoints allow roughly 30-40 requests per minute from a single IP before returning 429 errors. The 2017-06-30 versioned API is more generous than the newer endpoints.
JWT validation: Authenticated endpoints check JWT expiry strictly. Tokens expire after roughly 30 minutes of inactivity. Re-authenticate when you get 401 responses.
Cloudflare protection: The web pages (not API) sit behind Cloudflare. If you're scraping the HTML as a fallback, you'll hit JavaScript challenges. The API endpoints bypass most of this since they return JSON directly.
IP reputation: Duolingo tracks request patterns per IP. Sustained automated access from datacenter IPs gets flagged within minutes. For any batch data collection, residential proxies are the way to go. ThorData gives you rotating residential IPs that look like regular app users connecting from home networks — which is exactly what Duolingo expects to see.
def create_proxy_session(proxy_url):
    """Create a requests session routed through a proxy.

    Both plain and TLS traffic are sent through the same proxy URL, and
    browser-like headers are installed as session defaults.
    """
    proxied = requests.Session()
    proxied.proxies = {"http": proxy_url, "https": proxy_url}
    proxied.headers.update({
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
        "Accept": "application/json"
    })
    return proxied
# Usage:
# session = create_proxy_session("http://user:[email protected]:9000")
# profile = session.get(f"https://www.duolingo.com/2017-06-30/users?username=someuser")
Fingerprint consistency: When using proxies, make sure your User-Agent and other headers are consistent within a session. Duolingo correlates headers with IP — a residential IP sending requests with a bot-like User-Agent gets flagged faster than a datacenter IP with a browser User-Agent.
Monitoring for API Changes
Undocumented APIs break without warning. Build in resilience:
def safe_api_call(url, headers=None, max_retries=3):
    """Make an API call with retry logic and change detection.

    Args:
        url: full endpoint URL.
        headers: optional extra headers merged over browser defaults.
        max_retries: attempts before giving up.

    Returns:
        Parsed JSON on HTTP 200; None on 404 (endpoint likely moved)
        or after exhausting retries.
    """
    default_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept": "application/json"
    }
    if headers:
        default_headers.update(headers)
    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=default_headers, timeout=15)
        except requests.exceptions.RequestException:
            # Bug fix: the original caught only Timeout, so connection
            # resets and DNS failures escaped the retry loop entirely.
            # RequestException is the base class covering all of them.
            time.sleep(2)
            continue
        if response.status_code == 200:
            return response.json()
        elif response.status_code == 404:
            # A 404 on a previously-working URL usually means the
            # undocumented endpoint moved; retrying will not help.
            print(f"Endpoint may have changed: {url}")
            return None
        elif response.status_code == 429:
            wait = (2 ** attempt) * 5  # exponential backoff: 5s, 10s, 20s
            print(f"Rate limited, waiting {wait}s")
            time.sleep(wait)
        else:
            print(f"HTTP {response.status_code} for {url}")
            time.sleep(2)
    return None
Data Export
import csv
def export_user_data(users, filename="duolingo_users.csv"):
    """Export user profile data to CSV.

    Args:
        users: list of profile dicts from get_user_profile; entries
            containing an "error" key are skipped.
        filename: output CSV path.

    Writes one flattened row per successful profile; does nothing when
    there is no exportable data.
    """
    if not users:
        return
    flat = []
    for u in users:
        if "error" in u:
            continue
        flat.append({
            "username": u.get("username"),
            "streak": u.get("streak"),
            "total_xp": u.get("total_xp"),
            "creation_date": u.get("creation_date"),
            "current_course": u.get("current_course"),
            "num_courses": len(u.get("courses", [])),
            "longest_streak": u.get("longest_streak")
        })
    if not flat:
        return
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=flat[0].keys())
        writer.writeheader()
        writer.writerows(flat)
    # Bug fix: the original printed the literal text "(unknown)" instead
    # of interpolating the output filename.
    print(f"Exported {len(flat)} users to {filename}")
Endpoint Reference
Here's a summary of the undocumented endpoints covered in this guide:
| Endpoint | Auth | Returns |
|---|---|---|
| `/api/1/courses/list` | No | All available language courses |
| `/2017-06-30/users?username=X` | No | Public user profile, streak, XP |
| `/login` | POST | JWT session token |
| `/2017-06-30/users/{id}/courses/{id}` | Yes | Skill tree, lessons, words |
| `/leaderboards/.../users/{id}` | Yes | Leaderboard tier and cohort |
These endpoints have been stable since mid-2024 but can change at any time. The versioned endpoints (2017-06-30) tend to be more stable than unversioned ones.
Legal and Ethical Notes
Duolingo's Terms of Service prohibit automated access. These endpoints are undocumented and unsupported — Duolingo can change or block them at any time.
This code is for educational purposes and personal research. Don't use it to build competing products, harvest user data at scale, or interfere with Duolingo's services. If you're collecting user data, stick to genuinely public profiles and respect people's privacy.
Collecting Language Learning Statistics at Scale
If you want to analyze Duolingo's learner base systematically, here is a structured approach for collecting public user data responsibly:
import requests
import json
import time
import random
import sqlite3
from datetime import datetime
# Root of every endpoint used in this section.
BASE_URL = "https://www.duolingo.com"
# Browser-like defaults reused across requests so the header fingerprint
# stays consistent within a collection run.
HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
"Accept": "application/json",
"Accept-Language": "en-US,en;q=0.9",
}
def get_leaderboard_users(session: "requests.Session | None" = None) -> list:
    """
    Collect users from public leaderboards.

    Args:
        session: optional pre-configured session (e.g. proxied); a plain
            requests.get with browser headers is used when omitted.

    Returns:
        Deduplicated list of username strings (order not guaranteed).
    """
    # The global leaderboard page exposes top users publicly
    url = f"{BASE_URL}/2017-06-30/leaderboards"
    headers = {**HEADERS}
    if session:
        # Fix: the original built `headers` but never sent them on this
        # branch, so a bare Session made the request with its own
        # (possibly default) headers.
        resp = session.get(url, headers=headers, timeout=15)
    else:
        resp = requests.get(url, headers=headers, timeout=15)
    if resp.status_code != 200:
        return []
    data = resp.json()
    users = []
    for lb in data.get("leaderboards", []):
        for user in lb.get("cohort", []):
            username = user.get("display_name")
            if username:
                users.append(username)
    return list(set(users))  # Deduplicate
def get_course_statistics() -> dict:
    """Fetch aggregate statistics for all Duolingo courses.

    Summarizes total learners, counts per release phase, per-language
    learner totals, and the ten largest courses. Returns {} when the
    course catalog could not be fetched.
    """
    from collections import Counter

    courses = get_duolingo_courses()
    if not courses:
        return {}

    # Per-language learner totals (one language can appear in many pairs).
    totals_by_language = {}
    for course in courses:
        name = course.get("learning_language_name", "Unknown")
        totals_by_language[name] = (
            totals_by_language.get(name, 0) + course.get("num_learners", 0)
        )

    return {
        "total_courses": len(courses),
        "total_learners": sum(c.get("num_learners", 0) for c in courses),
        "by_phase": dict(Counter(c.get("phase") for c in courses)),
        "by_learning_language": sorted(
            [{"language": k, "total_learners": v} for k, v in totals_by_language.items()],
            key=lambda item: item["total_learners"],
            reverse=True,
        ),
        "top_10_by_learners": sorted(
            courses, key=lambda c: c.get("num_learners", 0), reverse=True
        )[:10],
    }
def init_duolingo_db(db_path: str = "duolingo.db") -> sqlite3.Connection:
    """Create the SQLite schema (if missing) and return an open connection.

    Tables: users (one row per profile), user_courses (per-user course
    progress), course_stats (daily learner-count snapshots). The two
    indexes support streak/XP leaderboard-style queries.
    """
    conn = sqlite3.connect(db_path)
    schema = """
        CREATE TABLE IF NOT EXISTS users (
            username TEXT PRIMARY KEY,
            streak INTEGER,
            total_xp INTEGER,
            creation_date INTEGER,
            current_course TEXT,
            num_courses INTEGER,
            longest_streak INTEGER,
            scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        );
        CREATE TABLE IF NOT EXISTS user_courses (
            username TEXT,
            course_id TEXT,
            course_title TEXT,
            xp INTEGER,
            crowns INTEGER,
            level INTEGER,
            PRIMARY KEY (username, course_id)
        );
        CREATE TABLE IF NOT EXISTS course_stats (
            snapshot_date TEXT,
            learning_language TEXT,
            from_language TEXT,
            num_learners INTEGER,
            phase INTEGER,
            PRIMARY KEY (snapshot_date, learning_language, from_language)
        );
        CREATE INDEX IF NOT EXISTS idx_users_streak ON users(streak DESC);
        CREATE INDEX IF NOT EXISTS idx_users_xp ON users(total_xp DESC);
    """
    conn.executescript(schema)
    conn.commit()
    return conn
def save_user_profile(conn: sqlite3.Connection, profile: dict):
    """Upsert one user profile and its per-course rows, then commit."""
    username = profile.get("username")
    conn.execute(
        """INSERT OR REPLACE INTO users
        (username, streak, total_xp, creation_date, current_course,
        num_courses, longest_streak)
        VALUES (?,?,?,?,?,?,?)""",
        (
            username,
            profile.get("streak"),
            profile.get("total_xp"),
            profile.get("creation_date"),
            profile.get("current_course"),
            len(profile.get("courses", [])),
            profile.get("longest_streak"),
        ),
    )
    course_rows = [
        (
            username,
            course.get("id"),
            course.get("title"),
            course.get("xp"),
            course.get("crowns"),
            course.get("level"),
        )
        for course in profile.get("courses", [])
    ]
    conn.executemany(
        """INSERT OR REPLACE INTO user_courses
        (username, course_id, course_title, xp, crowns, level)
        VALUES (?,?,?,?,?,?)""",
        course_rows,
    )
    conn.commit()
Analyzing Learning Patterns
Once you have user data, you can analyze learning behavior:
import statistics
def analyze_learning_patterns(db_path: str = "duolingo.db") -> dict:
    """Analyze patterns in collected Duolingo user data.

    Summarizes streak and XP distributions, streak-milestone retention,
    and the ten most-followed courses from the users/user_courses tables.
    Sections with no data are omitted from the result.
    """
    conn = sqlite3.connect(db_path)

    def positive_column(sql):
        # Helper: single-column query, nulls and zeros already filtered by SQL.
        return [row[0] for row in conn.execute(sql).fetchall()]

    streaks = positive_column(
        "SELECT streak FROM users WHERE streak IS NOT NULL AND streak > 0"
    )
    xp_values = positive_column(
        "SELECT total_xp FROM users WHERE total_xp IS NOT NULL AND total_xp > 0"
    )

    popular_courses = conn.execute("""
        SELECT course_title, COUNT(*) as user_count, AVG(xp) as avg_xp
        FROM user_courses
        WHERE course_title IS NOT NULL
        GROUP BY course_title
        ORDER BY user_count DESC
        LIMIT 10
    """).fetchall()

    # Retention buckets: users holding a streak of at least N days.
    streak_milestones = {
        f"{days}_days": conn.execute(
            "SELECT COUNT(*) FROM users WHERE streak >= ?", (days,)
        ).fetchone()[0]
        for days in [7, 30, 100, 365, 1000]
    }

    conn.close()

    result = {}
    if streaks:
        result["streak_stats"] = {
            "count": len(streaks),
            "median": statistics.median(streaks),
            "mean": round(statistics.mean(streaks), 1),
            "max": max(streaks),
            "milestones": streak_milestones,
        }
    if xp_values:
        result["xp_stats"] = {
            "count": len(xp_values),
            "median": statistics.median(xp_values),
            "mean": round(statistics.mean(xp_values), 0),
            "max": max(xp_values),
        }
    result["popular_courses"] = [
        {"course": title, "users": user_count, "avg_xp": round(avg_xp or 0, 0)}
        for title, user_count, avg_xp in popular_courses
    ]
    return result
Monitoring for API Changes
Duolingo's undocumented endpoints change without notice. Build monitoring to detect when they break:
import hashlib
# Fields every profile response contained when this scraper was written;
# validate_api_response uses this set to detect schema drift.
EXPECTED_PROFILE_FIELDS = {
"streak", "totalXp", "currentCourseId", "courses", "creationDate"
}
def validate_api_response(data: dict, endpoint: str) -> tuple:
    """Validate that an API response has the expected structure.

    Args:
        data: parsed JSON body.
        endpoint: logical endpoint name ("user_profile" or "courses").

    Returns:
        (is_valid, issues) — issues is a list of short problem codes.
    """
    issues = []
    if endpoint == "user_profile":
        users = data.get("users", [])
        if users:
            missing = EXPECTED_PROFILE_FIELDS - set(users[0].keys())
            if missing:
                issues.append(f"missing_fields: {missing}")
        else:
            issues.append("no_users_in_response")
    elif endpoint == "courses":
        if not isinstance(data, list):
            issues.append("not_a_list")
        elif data and "learning_language" not in data[0]:
            issues.append("course_format_changed")
    return len(issues) == 0, issues
def check_endpoint_health() -> dict:
    """Quick health check of all Duolingo endpoints.

    Returns:
        dict keyed by endpoint name; each value reports a status of
        "ok" / "degraded" / "down" / "error" plus supporting details.
    """
    results = {}

    def probe(name, url, schema):
        # One probe per endpoint: any exception (network, JSON decode)
        # is recorded as "error" rather than raised.
        try:
            resp = requests.get(url, headers=HEADERS, timeout=10)
            if resp.status_code == 200:
                valid, issues = validate_api_response(resp.json(), schema)
                results[name] = {"status": "ok" if valid else "degraded", "issues": issues}
            else:
                results[name] = {"status": "down", "http_code": resp.status_code}
        except Exception as e:
            results[name] = {"status": "error", "message": str(e)}

    # Test courses endpoint
    probe("courses", f"{BASE_URL}/api/1/courses/list", "courses")
    # Test user profile endpoint
    probe(
        "user_profile",
        f"{BASE_URL}/2017-06-30/users?username=LuisvonAhn&fields=streak,totalXp",
        "user_profile",
    )
    return results
# Run health check before any bulk scraping
health = check_endpoint_health()
for name, info in health.items():
    print(f" {name}: {info['status']}")
    detected_issues = info.get("issues")
    if detected_issues:
        print(f" Issues: {detected_issues}")
Proxy Configuration for Sustained Collection
For collecting data across many users, distribute requests to avoid Duolingo's per-IP rate limits:
def create_proxy_session(proxy_url: str) -> requests.Session:
    """Create a requests session with proxy and consistent headers."""
    proxied = requests.Session()
    # Route both plain and TLS traffic through the same proxy endpoint.
    proxied.proxies = {"http": proxy_url, "https": proxy_url}
    # Install browser-like defaults once so every request in the session
    # carries the same fingerprint.
    proxied.headers.update({
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
        "Accept": "application/json",
        "Accept-Language": "en-US,en;q=0.9",
    })
    return proxied
# ThorData rotating residential proxy
# Use residential IPs to avoid Duolingo flagging datacenter traffic
PROXY_URL = "http://USERNAME:[email protected]:9000"
session = create_proxy_session(PROXY_URL)

# Bug fix: the original looped over an undefined `usernames` variable
# (NameError). Populate it first — e.g. from the public leaderboards.
usernames = get_leaderboard_users(session)

# Each request through a rotating proxy appears from a different residential IP
for username in usernames[:100]:
    profile = get_user_profile(username, session=session)
    time.sleep(random.uniform(2, 4))
Complete Collection Pipeline
def run_duolingo_pipeline(
    usernames: list = None,
    db_path: str = "duolingo.db",
    proxy_url: str = None,
):
    """
    Full data collection pipeline:
    1. Collect course statistics
    2. Fetch user profiles (if usernames provided)
    3. Store everything in SQLite

    Returns the aggregate course statistics dict.
    """
    conn = init_duolingo_db(db_path)

    # Phase 1: Course statistics
    print("Collecting course statistics...")
    stats = get_course_statistics()
    snapshot_day = datetime.now().strftime("%Y-%m-%d")

    # Save course learner counts
    courses = get_duolingo_courses()
    snapshot_rows = [
        (
            snapshot_day,
            course.get("learning_language"),
            course.get("from_language"),
            course.get("num_learners"),
            course.get("phase"),
        )
        for course in courses
    ]
    conn.executemany(
        """INSERT OR REPLACE INTO course_stats
        (snapshot_date, learning_language, from_language, num_learners, phase)
        VALUES (?,?,?,?,?)""",
        snapshot_rows,
    )
    conn.commit()
    print(f" Saved stats for {len(courses)} courses")

    # Phase 2: User profiles
    if usernames:
        # NOTE(review): the session is created but get_user_profile below
        # is called without it, matching the original flow.
        session = create_proxy_session(proxy_url) if proxy_url else None
        print(f"\nCollecting {len(usernames)} user profiles...")
        for i, username in enumerate(usernames):
            try:
                profile = get_user_profile(username)
                streak_info = get_streak_info(username)
                if streak_info:
                    profile["longest_streak"] = streak_info.get("longest_streak")
                save_user_profile(conn, profile)
                if (i + 1) % 10 == 0:
                    print(f" Progress: {i+1}/{len(usernames)}")
            except Exception as e:
                print(f" Error ({username}): {e}")
            time.sleep(random.uniform(2.0, 4.0))

    conn.close()
    print("\nPipeline complete")
    return stats
# Run it
stats = run_duolingo_pipeline(proxy_url="http://USER:[email protected]:9000")
print(f"\nTop 5 courses by learner count:")
for entry in stats.get("top_10_by_learners", [])[:5]:
    print(f" {entry['learning_language_name']}: {entry['num_learners']:,} learners")
Endpoint Reference (2026)
| Endpoint | Auth | Rate Limit | Returns |
|---|---|---|---|
| `/api/1/courses/list` | No | ~60/min | All courses with learner counts |
| `/2017-06-30/users?username=X` | No | ~30/min | Profile, streak, XP, courses |
| `/2017-06-30/users?username=X&fields=streakData` | No | ~30/min | Extended streak history |
| `/login` (POST) | POST credentials | Strict | JWT session token |
| `/2017-06-30/users/{id}/courses/{id}` | Yes | ~30/min | Skill tree, lessons |
| `/leaderboards/.../users/{id}` | Yes | ~20/min | Leaderboard cohort |
The versioned 2017-06-30 endpoints have been stable since mid-2024, making them more reliable for production pipelines than the newer unversioned endpoints.
Legal and Ethical Notes
Duolingo's Terms of Service prohibit automated access. These endpoints are undocumented and Duolingo can change or block them at any time.
Keep use limited to research, personal projects, and analytics. Do not collect or store personal user data at scale. Do not use this to build competing products or interfere with Duolingo's services.
The course statistics (learner counts, language data) are published by Duolingo in marketing materials and press releases, so collecting them for research purposes is lower risk than collecting individual user data.