How to Scrape Wikipedia with Python: Action API & REST API (2026)
Wikipedia is one of the most generous sites on the internet when it comes to data access. Two fully documented APIs, no authentication required, and they explicitly encourage automated access.
But "easy to access" doesn't mean "easy to use well." The Action API has hundreds of parameters. The REST API returns different data structures depending on the endpoint. And if you're doing bulk extraction, you need to understand their rate limits and etiquette guidelines.
Here's a practical guide to both APIs with complete Python code for every major use case.
Setup
pip install httpx
No special libraries needed — Wikipedia's APIs are simple HTTP+JSON. One critical requirement: you must set a descriptive User-Agent header. Requests without one get 403'd by Wikipedia's servers.
import httpx
import time
import json
import re
from typing import Optional
# Action API entry point: full-featured query/parse/search interface.
WIKI_API = "https://en.wikipedia.org/w/api.php"
# REST API base: simpler, page-oriented endpoints (summary, media, related).
REST_API = "https://en.wikipedia.org/api/rest_v1"
# REQUIRED: Wikipedia needs a real User-Agent with contact info
# Replace with your actual bot name and email
SESSION = httpx.Client(
    headers={"User-Agent": "MyResearchBot/1.0 ([email protected])"},
    timeout=15,
)
The Action API: Full Power
The Action API at en.wikipedia.org/w/api.php is Wikipedia's workhorse. It supports querying, parsing, searching, and editing (with auth). For scraping purposes, query and parse are the main actions.
Fetching Article Content
def get_article_wikitext(title: str) -> str:
    """Return the raw wikitext markup of *title*, or "" if the page is missing."""
    query = {
        "action": "query",
        "titles": title,
        "prop": "revisions",
        "rvprop": "content",
        "rvslots": "main",
        "format": "json",
        "formatversion": "2",
    }
    payload = SESSION.get(WIKI_API, params=query).json()
    page_list = payload["query"]["pages"]
    # formatversion=2 returns pages as a list; nonexistent pages carry a "missing" key.
    if not page_list or "missing" in page_list[0]:
        return ""
    return page_list[0]["revisions"][0]["slots"]["main"]["content"]
def get_article_html(title: str) -> str:
    """Return the rendered HTML body of *title* via action=parse."""
    query = {
        "action": "parse",
        "page": title,
        "prop": "text",
        "format": "json",
        "formatversion": "2",
    }
    response = SESSION.get(WIKI_API, params=query)
    return response.json()["parse"]["text"]
def get_plain_text(title: str, sentences: Optional[int] = None,
                   intro_only: bool = False) -> str:
    """
    Get a plain-text extract of an article.

    title: article title
    sentences: limit the extract to N sentences (None = no limit)
    intro_only: return only the introduction section

    Returns "" when the article does not exist.
    """
    params = {
        "action": "query",
        "titles": title,
        "prop": "extracts",
        "explaintext": True,
        "format": "json",
        "formatversion": "2",
    }
    if sentences:
        params["exsentences"] = sentences
    if intro_only:
        # MediaWiki boolean params are true whenever present, so only
        # add exintro when it is actually wanted.
        params["exintro"] = True
    resp = SESSION.get(WIKI_API, params=params)
    data = resp.json()
    pages = data["query"]["pages"]
    if not pages:
        return ""
    page = pages[0]
    # formatversion=2 marks nonexistent pages with a "missing" key.
    if "missing" in page:
        return ""
    return page.get("extract", "")
# Examples
# Five-sentence intro of one article:
intro = get_plain_text("Python (programming language)", sentences=5)
print(intro)
# Full plain-text body of another:
full_text = get_plain_text("Machine learning")
print(f"Article length: {len(full_text)} characters")
Batching Multiple Articles
The Action API supports up to 50 titles per request — always use this to minimize API calls:
def get_multiple_extracts(titles: list, intro_only: bool = True) -> dict:
    """
    Fetch extracts for multiple articles, batched 50 titles per request.

    titles: article titles (any length; processed in batches of 50)
    intro_only: if True, fetch only the introduction of each article
    Returns dict of {title: extract_text}; missing articles are omitted.
    """
    results = {}
    # Process in batches of 50 (API limit for regular clients)
    for i in range(0, len(titles), 50):
        batch = titles[i:i+50]
        params = {
            "action": "query",
            "titles": "|".join(batch),
            "prop": "extracts",
            "explaintext": True,
            "format": "json",
            "formatversion": "2",
        }
        # BUG FIX: MediaWiki boolean params are true whenever present —
        # sending "exintro=False" still means true. Include the key only
        # when intro_only is actually requested.
        if intro_only:
            params["exintro"] = True
        resp = SESSION.get(WIKI_API, params=params)
        data = resp.json()
        for page in data["query"]["pages"]:
            if "missing" not in page:
                results[page["title"]] = page.get("extract", "")
        time.sleep(0.1)  # small delay between batches
    return results
# Fetch several articles in one batched request (one API call covers up to 50 titles)
titles = ["Python (programming language)", "JavaScript", "Rust (programming language)",
          "Go (programming language)", "TypeScript"]
extracts = get_multiple_extracts(titles)
for title, text in extracts.items():
    print(f"{title}: {len(text)} chars")
Search
The Action API has two search modes:
def full_text_search(query: str, limit: int = 20,
                     namespace: int = 0) -> list:
    """
    Full-text search across article content.
    namespace: 0=articles, 1=Talk, 4=Wikipedia, 6=File, 10=Template
    """
    request_params = {
        "action": "query",
        "list": "search",
        "srsearch": query,
        "srlimit": limit,
        "srnamespace": namespace,
        "srprop": "snippet|titlesnippet|size|wordcount|timestamp",
        "format": "json",
    }
    payload = SESSION.get(WIKI_API, params=request_params).json()
    return payload["query"]["search"]
def prefix_search(prefix: str, limit: int = 10) -> list:
    """Title autocomplete — articles starting with a prefix."""
    request_params = {
        "action": "query",
        "list": "prefixsearch",
        "pssearch": prefix,
        "pslimit": limit,
        "format": "json",
    }
    payload = SESSION.get(WIKI_API, params=request_params).json()
    return payload["query"]["prefixsearch"]
# Examples
# Full-text search, ranked by relevance:
results = full_text_search("neural networks transformer architecture", limit=10)
for r in results:
    print(f"{r['title']} ({r['wordcount']} words) — {r['snippet'][:80]}...")
# Title autocomplete:
suggestions = prefix_search("Quantum comp", limit=5)
for s in suggestions:
    print(f" {s['title']}")
Revision History
Every Wikipedia article has a complete edit history:
def get_revision_history(title: str, limit: int = 50,
                         start: Optional[str] = None,
                         end: Optional[str] = None) -> list:
    """
    Get edit history for an article, following continuation as needed.

    limit: maximum number of revisions to return
    start/end: ISO 8601 timestamps e.g. "2024-01-01T00:00:00Z"
    """
    params = {
        "action": "query",
        "titles": title,
        "prop": "revisions",
        "rvprop": "ids|timestamp|user|comment|size|flags",
        # The API caps rvlimit at 500 per request for regular clients;
        # larger limits are satisfied via the continuation loop below.
        "rvlimit": min(limit, 500),
        "format": "json",
        "formatversion": "2",
    }
    if start:
        params["rvstart"] = start
    if end:
        params["rvend"] = end
    revisions = []
    while True:
        resp = SESSION.get(WIKI_API, params=params)
        data = resp.json()
        pages = data["query"]["pages"]
        if pages:
            revisions.extend(pages[0].get("revisions", []))
        if "continue" not in data or len(revisions) >= limit:
            break
        params["rvcontinue"] = data["continue"]["rvcontinue"]
    return revisions[:limit]
def analyze_edit_activity(title: str) -> dict:
    """Analyze edit frequency and top editors for an article."""
    from collections import Counter
    history = get_revision_history(title, limit=500)
    # Revisions without a "user" field are bucketed as anonymous.
    editor_counts = Counter(rev.get("user", "Anonymous") for rev in history)
    per_month = Counter(
        rev.get("timestamp", "")[:7]  # YYYY-MM
        for rev in history
        if rev.get("timestamp", "")
    )
    recent_months = sorted(per_month.items())[-12:]  # last 12 months
    return {
        "total_revisions_fetched": len(history),
        "top_editors": editor_counts.most_common(10),
        "monthly_activity": dict(recent_months),
    }
# ChatGPT article edit analysis (fetches up to 500 recent revisions)
stats = analyze_edit_activity("ChatGPT")
print(f"Total revisions: {stats['total_revisions_fetched']}")
print("Top editors:")
for editor, count in stats["top_editors"][:5]:
    print(f" {editor}: {count} edits")
Category Crawling
Wikipedia's category system is a tree. Walk it to find all articles in a topic:
def get_category_members(category: str, limit: int = 100,
                         member_type: str = "page") -> list:
    """
    Get pages/subcategories in a category.
    member_type: "page", "subcat", "file"
    """
    request_params = {
        "action": "query",
        "list": "categorymembers",
        "cmtitle": f"Category:{category}",
        "cmlimit": min(limit, 500),  # API max per request
        "cmtype": member_type,
        "cmprop": "ids|title|type|timestamp",
        "format": "json",
    }
    collected = []
    while len(collected) < limit:
        data = SESSION.get(WIKI_API, params=request_params).json()
        collected += data["query"]["categorymembers"]
        # Stop when the API reports no further pages or we have enough.
        if "continue" not in data or len(collected) >= limit:
            break
        request_params["cmcontinue"] = data["continue"]["cmcontinue"]
        time.sleep(0.1)
    return collected[:limit]
def crawl_category_tree(category: str, max_depth: int = 2,
                        max_articles: int = 1000) -> list:
    """
    Recursively crawl a category tree.

    category: category name without the "Category:" prefix
    max_depth: how many subcategory levels to descend
    max_articles: hard cap on collected articles
    Returns flat list of article dicts across all subcategories.
    """
    articles = []
    seen_titles = set()    # O(1) dedup instead of scanning the list per page
    visited_cats = set()   # avoid cycles in the category graph

    def _crawl(cat: str, depth: int):
        if depth > max_depth or cat in visited_cats:
            return
        visited_cats.add(cat)
        # Get direct articles in this category
        pages = get_category_members(cat, limit=100, member_type="page")
        for p in pages:
            if p["title"] not in seen_titles:
                seen_titles.add(p["title"])
                articles.append(p)
                if len(articles) >= max_articles:
                    return
        # Get subcategories and recurse
        subcats = get_category_members(cat, limit=30, member_type="subcat")
        for sc in subcats:
            if len(articles) >= max_articles:
                return
            cat_name = sc["title"].replace("Category:", "")
            time.sleep(0.2)  # be polite between category requests
            _crawl(cat_name, depth + 1)

    _crawl(category, 0)
    return articles[:max_articles]
# Get all articles in the Machine Learning category tree
# (depth 2 = the category itself plus two levels of subcategories)
articles = crawl_category_tree("Machine learning", max_depth=2, max_articles=500)
print(f"Found {len(articles)} articles in Machine Learning category tree")
for a in articles[:10]:
    print(f" {a['title']}")
Backlinks and Links
def get_backlinks(title: str, limit: int = 100) -> list:
    """Get articles that link to this page (what links here)."""
    request_params = {
        "action": "query",
        "list": "backlinks",
        "bltitle": title,
        "bllimit": limit,
        "blnamespace": 0,  # article namespace only
        "format": "json",
    }
    payload = SESSION.get(WIKI_API, params=request_params).json()
    return payload["query"]["backlinks"]
def get_outgoing_links(title: str) -> list:
    """Get all links from an article to other Wikipedia articles."""
    request_params = {
        "action": "query",
        "titles": title,
        "prop": "links",
        "pllimit": 500,
        "plnamespace": 0,
        "format": "json",
        "formatversion": "2",
    }
    payload = SESSION.get(WIKI_API, params=request_params).json()
    page_list = payload["query"]["pages"]
    if not page_list:
        return []
    return [entry["title"] for entry in page_list[0].get("links", [])]
def get_article_metadata(title: str) -> dict:
    """Get categories, links, images, and coordinates in one request."""
    payload = SESSION.get(WIKI_API, params={
        "action": "query",
        "titles": title,
        "prop": "categories|links|images|coordinates|pageprops",
        "cllimit": 50,
        "pllimit": 100,
        "imlimit": 50,
        "format": "json",
        "formatversion": "2",
    }).json()
    page_list = payload["query"]["pages"]
    if not page_list:
        return {}
    page = page_list[0]
    coords = page.get("coordinates")
    return {
        "title": page.get("title"),
        "pageid": page.get("pageid"),
        "categories": [c["title"].replace("Category:", "")
                       for c in page.get("categories", [])],
        "links": [entry["title"] for entry in page.get("links", [])],
        "images": [img["title"] for img in page.get("images", [])],
        # The API returns coordinates as a list; keep only the primary pair.
        "coordinates": coords[0] if coords else None,
        "wikidata_id": page.get("pageprops", {}).get("wikibase_item"),
    }
The REST API: Clean and Simple
Wikipedia's REST API at en.wikipedia.org/api/rest_v1/ offers cleaner endpoints for common operations:
def get_summary(title: str) -> dict:
    """Get article summary with thumbnail. Ideal for quick data extraction.

    title: article title; spaces are converted to underscores and the
    result is percent-encoded so titles containing "&", "?", or "/" form
    a valid REST path segment (e.g. "AT&T", "AC/DC").
    Returns a dict with an "error" key when the article does not exist.
    """
    from urllib.parse import quote
    # REST paths use underscores for spaces; encode everything else.
    path_title = quote(title.replace(" ", "_"), safe="")
    resp = SESSION.get(f"{REST_API}/page/summary/{path_title}")
    if resp.status_code == 404:
        return {"error": "Article not found", "title": title}
    resp.raise_for_status()
    data = resp.json()
    return {
        "title": data["title"],
        "displaytitle": data.get("displaytitle", data["title"]),
        "extract": data["extract"],
        "extract_html": data.get("extract_html", ""),
        "description": data.get("description", ""),
        "thumbnail": data.get("thumbnail", {}).get("source"),
        "original_image": data.get("originalimage", {}).get("source"),
        "url": data["content_urls"]["desktop"]["page"],
        "wikidata_item": data.get("wikibase_item"),
        "coordinates": data.get("coordinates"),
        "last_modified": data.get("timestamp"),
    }
def get_article_sections(title: str) -> list:
    """Get article sections with HTML content (mobile format)."""
    resp = SESSION.get(f"{REST_API}/page/mobile-sections/{title}")
    if resp.status_code == 404:
        return []
    resp.raise_for_status()
    payload = resp.json()
    # The lead (intro) section lives under a separate key from the rest.
    lead = payload.get("lead", {})
    lead_parts = lead.get("sections")
    intro_html = lead_parts[0].get("text", "") if lead_parts else ""
    result = [{
        "heading": lead.get("normalizedtitle", title),
        "level": 0,
        "html": intro_html,
        "id": "intro",
    }]
    for section in payload.get("remaining", {}).get("sections", []):
        result.append({
            "heading": section.get("line", ""),
            "level": section.get("toclevel", 1),
            "html": section.get("text", ""),
            "id": section.get("anchor", ""),
        })
    return result
def get_related_articles(title: str) -> list:
    """Get Wikipedia's related article recommendations."""
    resp = SESSION.get(f"{REST_API}/page/related/{title}")
    if resp.status_code == 404:
        return []
    resp.raise_for_status()
    related = []
    for page in resp.json().get("pages", []):
        related.append({
            "title": page["title"],
            "extract": page.get("extract", "")[:200],  # trim long extracts
            "thumbnail": page.get("thumbnail", {}).get("source"),
        })
    return related
def _srcset_scale(variant: dict) -> float:
    """Numeric value of a srcset scale string like "1x", "1.5x", "2x"."""
    try:
        return float(str(variant.get("scale", "1x")).rstrip("xX"))
    except ValueError:
        return 1.0

def get_page_media(title: str) -> list:
    """Get all image files associated with an article.

    Picks the highest-resolution variant from each image's srcset.
    Returns [] when the article does not exist.
    """
    resp = SESSION.get(f"{REST_API}/page/media-list/{title}")
    if resp.status_code == 404:
        return []
    resp.raise_for_status()
    data = resp.json()
    media = []
    for item in data.get("items", []):
        if item.get("type") == "image":
            src_set = item.get("srcset", [])
            # BUG FIX: compare scales numerically — lexicographic comparison
            # of strings orders "1.5x" before "1x" and picks wrong variants.
            largest = max(src_set, key=_srcset_scale) if src_set else {}
            media.append({
                "title": item.get("title"),
                "type": item.get("type"),
                "caption": item.get("caption", {}).get("text", ""),
                # srcset URLs are protocol-relative ("//upload...")
                "url": "https:" + largest.get("src", "") if largest.get("src") else None,
            })
    return media
# Examples
# REST summary: title, short description, intro extract, and thumbnail URL.
summary = get_summary("Python_(programming_language)")
print(f"{summary['title']}")
print(f"Description: {summary['description']}")
print(f"Extract: {summary['extract'][:300]}...")
print(f"Thumbnail: {summary['thumbnail']}")
Async Bulk Fetching
For large-scale extraction, use async to parallelize requests:
import asyncio
import httpx
# Shared semaphore bounding concurrency for all async fetches below.
SEM = asyncio.Semaphore(10)  # max 10 concurrent requests
async def fetch_summary_async(client: httpx.AsyncClient, title: str) -> dict:
    """Fetch one REST summary; failures are returned as {"error": ...} dicts."""
    async with SEM:  # bound concurrency across all in-flight requests
        try:
            resp = await client.get(f"{REST_API}/page/summary/{title}")
            if resp.status_code == 404:
                return {"title": title, "error": "not found"}
            resp.raise_for_status()
            payload = resp.json()
            return {
                "title": payload["title"],
                "extract": payload.get("extract", ""),
                "description": payload.get("description", ""),
                "thumbnail": payload.get("thumbnail", {}).get("source"),
            }
        except Exception as exc:
            # Report per-title failures instead of aborting the whole batch.
            return {"title": title, "error": str(exc)}
async def bulk_fetch_summaries(titles: list) -> list:
    """Fetch summaries for many titles concurrently (bounded by SEM)."""
    request_headers = {"User-Agent": "MyResearchBot/1.0 ([email protected])"}
    async with httpx.AsyncClient(headers=request_headers, timeout=20) as client:
        pending = [fetch_summary_async(client, name) for name in titles]
        return await asyncio.gather(*pending)
# Fetch many summaries concurrently (10 titles here; scales to hundreds)
programming_languages = [
    "Python (programming language)", "JavaScript", "Rust (programming language)",
    "Go (programming language)", "TypeScript", "Kotlin (programming language)",
    "Swift (programming language)", "Ruby (programming language)",
    "C (programming language)", "C++",
]
results = asyncio.run(bulk_fetch_summaries(programming_languages))
for r in results:
    if "error" not in r:
        print(f"{r['title']}: {r['extract'][:100]}...")
Rate Limits, Etiquette, and Bulk Alternatives
Wikipedia's API guidelines ask for:
- A descriptive User-Agent with contact info (this is enforced — missing UA = 403)
- No more than ~200 requests/second sustained (you realistically won't hit this with Python)
- Add maxlag=5 parameter on Action API to back off when Wikipedia's servers are under load
# Always include maxlag for respectful bulk scraping.
# If replication lag exceeds this many seconds, the API returns a
# maxlag error instead of serving the request — retry after a pause.
params = {
    "action": "query",
    "titles": title,
    "prop": "extracts",
    "maxlag": 5,  # Back off if servers are laggy
    "format": "json",
}
For bulk extraction of millions of articles — use database dumps instead:
Wikipedia releases full database dumps every 2-4 weeks at dumps.wikimedia.org. These are XML files containing the full text of every article. For any project that needs the whole encyclopedia, parsing the dump is faster and more respectful than making millions of API calls.
# Parse Wikipedia XML dump (for large-scale extraction)
import xml.etree.ElementTree as ET
import bz2
def parse_wiki_dump(dump_path: str, max_articles: int = 1000):
    """
    Parse a Wikipedia XML dump file.

    dump_path: path to enwiki-*-pages-articles.xml.bz2
    max_articles: stop after collecting this many articles

    Returns a list of {"title": ..., "wikitext": ...} dicts with wikitext
    truncated to 5000 chars. Redirects and pages whose title contains a
    colon (namespace pages like "Talk:..." or "Category:...") are skipped.
    """
    articles = []
    NS = "http://www.mediawiki.org/xml/dtd/export-0.10/"
    with bz2.open(dump_path, "rb") as f:
        for event, elem in ET.iterparse(f, events=["end"]):
            if elem.tag == f"{{{NS}}}page":
                title_el = elem.find(f"{{{NS}}}title")
                text_el = elem.find(f".//{{{NS}}}text")
                if title_el is not None and text_el is not None:
                    title = title_el.text
                    text = text_el.text or ""
                    # BUG FIX: the redirect marker is case-insensitive
                    # ("#REDIRECT", "#redirect") and may follow whitespace.
                    is_redirect = text.lstrip().upper().startswith("#REDIRECT")
                    # Skip redirects and non-article (namespaced) pages.
                    if not is_redirect and ":" not in title:
                        articles.append({"title": title, "wikitext": text[:5000]})
                        if len(articles) >= max_articles:
                            break
                elem.clear()  # Free memory for processed <page> elements
    return articles
For projects where you need to make high volumes of API requests simultaneously — such as cross-referencing thousands of article summaries — routing through a proxy service like ThorData distributes requests across multiple IPs and prevents any single IP from triggering Wikipedia's automated abuse detection systems.
Useful Complete Examples
Build a topic knowledge graph:
def build_knowledge_graph(seed_topic: str, depth: int = 2) -> dict:
    """Build a graph of related articles starting from a seed topic."""
    graph = {"nodes": [], "edges": []}
    seen = set()

    def explore(title: str, current_depth: int):
        # Stop at the depth limit and never revisit a title.
        if current_depth > depth or title in seen:
            return
        seen.add(title)
        summary = get_summary(title)
        if "error" in summary:
            return
        graph["nodes"].append({
            "id": title,
            "description": summary.get("description", ""),
            "wikidata": summary.get("wikidata_item"),
        })
        if current_depth >= depth:
            return
        # limit to 10 links per article
        for neighbour in get_outgoing_links(title)[:10]:
            graph["edges"].append({"from": title, "to": neighbour})
            time.sleep(0.2)
            explore(neighbour, current_depth + 1)

    explore(seed_topic, 0)
    return graph
# Build a small graph: the seed article plus one hop of outgoing links.
graph = build_knowledge_graph("Artificial intelligence", depth=1)
print(f"Nodes: {len(graph['nodes'])}, Edges: {len(graph['edges'])}")
Summary
Wikipedia is the benchmark for API-friendly data access. Key takeaways:
- Always set User-Agent — it's required, not optional
- Use REST API for summaries — it's simpler and returns clean, structured data
- Use Action API for search, revisions, categories — more powerful for complex queries
- Batch requests — use pipe-separated titles ("Python|JavaScript|Rust") to fetch up to 50 articles in one call
- Use async for bulk work — `httpx.AsyncClient` with a semaphore is much faster than sequential requests
- For millions of articles, use dumps — don't API-scrape the whole encyclopedia; download the dump
- Add `maxlag=5` — be a respectful bot that backs off under server load
The Action API is flexible but complex — you'll spend time in the parameter documentation. The REST API is simpler but covers fewer use cases. For most scraping tasks, start with the REST API for summaries and basic content, use the Action API for search, revisions, and category crawling, and use database dumps if you need the whole encyclopedia.