How to Scrape Wikipedia with Python: Action API & REST API (2026)
Wikipedia is one of the most generous sites on the internet when it comes to data access. Two fully documented APIs, no authentication required, and they explicitly encourage automated access.
But "easy to access" doesn't mean "easy to use well." The Action API has hundreds of parameters. The REST API returns different data structures depending on the endpoint. And if you're doing bulk extraction, you need to understand their rate limits and etiquette guidelines.
Here's a practical guide to both APIs with complete Python code for every major use case.
Setup
pip install httpx
No special libraries needed — Wikipedia's APIs are simple HTTP+JSON. One critical requirement: you must set a descriptive User-Agent header. Requests without one get 403'd by Wikipedia's servers.
import httpx
import time
import json
import re
from typing import Optional
# Action API entry point: full-featured query/parse/search interface.
WIKI_API = "https://en.wikipedia.org/w/api.php"
# REST API base: simpler, page-oriented endpoints (summary, media, related).
REST_API = "https://en.wikipedia.org/api/rest_v1"
# REQUIRED: Wikipedia needs a real User-Agent with contact info
# Replace with your actual bot name and email
SESSION = httpx.Client(
    headers={"User-Agent": "MyResearchBot/1.0 ([email protected])"},
    timeout=15,
)
The Action API: Full Power
The Action API at en.wikipedia.org/w/api.php is Wikipedia's workhorse. It supports querying, parsing, searching, and editing (with auth). For scraping purposes, query and parse are the main actions.
Fetching Article Content
def get_article_wikitext(title: str) -> str:
    """Return the raw wikitext markup of *title*, or "" if the page is missing."""
    query = {
        "action": "query",
        "titles": title,
        "prop": "revisions",
        "rvprop": "content",
        "rvslots": "main",
        "format": "json",
        "formatversion": "2",
    }
    payload = SESSION.get(WIKI_API, params=query).json()
    page_list = payload["query"]["pages"]
    # formatversion=2 returns pages as a list; nonexistent pages carry a "missing" key.
    if not page_list or "missing" in page_list[0]:
        return ""
    return page_list[0]["revisions"][0]["slots"]["main"]["content"]
def get_article_html(title: str) -> str:
    """Return the rendered HTML body of *title* via action=parse."""
    query = {
        "action": "parse",
        "page": title,
        "prop": "text",
        "format": "json",
        "formatversion": "2",
    }
    response = SESSION.get(WIKI_API, params=query)
    return response.json()["parse"]["text"]
def get_plain_text(title: str, sentences: Optional[int] = None,
                   intro_only: bool = False) -> str:
    """
    Get a plain-text extract of an article.

    title: article title
    sentences: limit the extract to N sentences (None = no limit)
    intro_only: return only the introduction section

    Returns "" when the article does not exist.
    """
    params = {
        "action": "query",
        "titles": title,
        "prop": "extracts",
        "explaintext": True,
        "format": "json",
        "formatversion": "2",
    }
    if sentences:
        params["exsentences"] = sentences
    if intro_only:
        # MediaWiki boolean params are true whenever present, so only
        # add exintro when it is actually wanted.
        params["exintro"] = True
    resp = SESSION.get(WIKI_API, params=params)
    data = resp.json()
    pages = data["query"]["pages"]
    if not pages:
        return ""
    page = pages[0]
    # formatversion=2 marks nonexistent pages with a "missing" key.
    if "missing" in page:
        return ""
    return page.get("extract", "")
# Examples
# Five-sentence intro of one article:
intro = get_plain_text("Python (programming language)", sentences=5)
print(intro)
# Full plain-text body of another:
full_text = get_plain_text("Machine learning")
print(f"Article length: {len(full_text)} characters")
Batching Multiple Articles
The Action API supports up to 50 titles per request — always use this to minimize API calls:
def get_multiple_extracts(titles: list, intro_only: bool = True) -> dict:
    """
    Fetch extracts for multiple articles, batched 50 titles per request.

    titles: article titles (any length; processed in batches of 50)
    intro_only: if True, fetch only the introduction of each article
    Returns dict of {title: extract_text}; missing articles are omitted.
    """
    results = {}
    # Process in batches of 50 (API limit for regular clients)
    for i in range(0, len(titles), 50):
        batch = titles[i:i+50]
        params = {
            "action": "query",
            "titles": "|".join(batch),
            "prop": "extracts",
            "explaintext": True,
            "format": "json",
            "formatversion": "2",
        }
        # BUG FIX: MediaWiki boolean params are true whenever present —
        # sending "exintro=False" still means true. Include the key only
        # when intro_only is actually requested.
        if intro_only:
            params["exintro"] = True
        resp = SESSION.get(WIKI_API, params=params)
        data = resp.json()
        for page in data["query"]["pages"]:
            if "missing" not in page:
                results[page["title"]] = page.get("extract", "")
        time.sleep(0.1)  # small delay between batches
    return results
# Fetch several articles in one batched request (one API call covers up to 50 titles)
titles = ["Python (programming language)", "JavaScript", "Rust (programming language)",
          "Go (programming language)", "TypeScript"]
extracts = get_multiple_extracts(titles)
for title, text in extracts.items():
    print(f"{title}: {len(text)} chars")
Search
The Action API has two search modes:
def full_text_search(query: str, limit: int = 20,
                     namespace: int = 0) -> list:
    """
    Full-text search across article content.
    namespace: 0=articles, 1=Talk, 4=Wikipedia, 6=File, 10=Template
    """
    request_params = {
        "action": "query",
        "list": "search",
        "srsearch": query,
        "srlimit": limit,
        "srnamespace": namespace,
        "srprop": "snippet|titlesnippet|size|wordcount|timestamp",
        "format": "json",
    }
    payload = SESSION.get(WIKI_API, params=request_params).json()
    return payload["query"]["search"]
def prefix_search(prefix: str, limit: int = 10) -> list:
    """Title autocomplete — articles starting with a prefix."""
    request_params = {
        "action": "query",
        "list": "prefixsearch",
        "pssearch": prefix,
        "pslimit": limit,
        "format": "json",
    }
    payload = SESSION.get(WIKI_API, params=request_params).json()
    return payload["query"]["prefixsearch"]
# Examples
# Full-text search, ranked by relevance:
results = full_text_search("neural networks transformer architecture", limit=10)
for r in results:
    print(f"{r['title']} ({r['wordcount']} words) — {r['snippet'][:80]}...")
# Title autocomplete:
suggestions = prefix_search("Quantum comp", limit=5)
for s in suggestions:
    print(f" {s['title']}")
Revision History
Every Wikipedia article has a complete edit history:
def get_revision_history(title: str, limit: int = 50,
                         start: Optional[str] = None,
                         end: Optional[str] = None) -> list:
    """
    Get edit history for an article, following continuation as needed.

    limit: maximum number of revisions to return
    start/end: ISO 8601 timestamps e.g. "2024-01-01T00:00:00Z"
    """
    params = {
        "action": "query",
        "titles": title,
        "prop": "revisions",
        "rvprop": "ids|timestamp|user|comment|size|flags",
        # The API caps rvlimit at 500 per request for regular clients;
        # larger limits are satisfied via the continuation loop below.
        "rvlimit": min(limit, 500),
        "format": "json",
        "formatversion": "2",
    }
    if start:
        params["rvstart"] = start
    if end:
        params["rvend"] = end
    revisions = []
    while True:
        resp = SESSION.get(WIKI_API, params=params)
        data = resp.json()
        pages = data["query"]["pages"]
        if pages:
            revisions.extend(pages[0].get("revisions", []))
        if "continue" not in data or len(revisions) >= limit:
            break
        params["rvcontinue"] = data["continue"]["rvcontinue"]
    return revisions[:limit]
def analyze_edit_activity(title: str) -> dict:
    """Analyze edit frequency and top editors for an article."""
    from collections import Counter
    history = get_revision_history(title, limit=500)
    # Revisions without a "user" field are bucketed as anonymous.
    editor_counts = Counter(rev.get("user", "Anonymous") for rev in history)
    per_month = Counter(
        rev.get("timestamp", "")[:7]  # YYYY-MM
        for rev in history
        if rev.get("timestamp", "")
    )
    recent_months = sorted(per_month.items())[-12:]  # last 12 months
    return {
        "total_revisions_fetched": len(history),
        "top_editors": editor_counts.most_common(10),
        "monthly_activity": dict(recent_months),
    }
# ChatGPT article edit analysis (fetches up to 500 recent revisions)
stats = analyze_edit_activity("ChatGPT")
print(f"Total revisions: {stats['total_revisions_fetched']}")
print("Top editors:")
for editor, count in stats["top_editors"][:5]:
    print(f" {editor}: {count} edits")
Category Crawling
Wikipedia's category system is a tree. Walk it to find all articles in a topic:
def get_category_members(category: str, limit: int = 100,
                         member_type: str = "page") -> list:
    """
    Get pages/subcategories in a category.
    member_type: "page", "subcat", "file"
    """
    request_params = {
        "action": "query",
        "list": "categorymembers",
        "cmtitle": f"Category:{category}",
        "cmlimit": min(limit, 500),  # API max per request
        "cmtype": member_type,
        "cmprop": "ids|title|type|timestamp",
        "format": "json",
    }
    collected = []
    while len(collected) < limit:
        data = SESSION.get(WIKI_API, params=request_params).json()
        collected += data["query"]["categorymembers"]
        # Stop when the API reports no further pages or we have enough.
        if "continue" not in data or len(collected) >= limit:
            break
        request_params["cmcontinue"] = data["continue"]["cmcontinue"]
        time.sleep(0.1)
    return collected[:limit]
def crawl_category_tree(category: str, max_depth: int = 2,
                        max_articles: int = 1000) -> list:
    """
    Recursively crawl a category tree.

    category: category name without the "Category:" prefix
    max_depth: how many subcategory levels to descend
    max_articles: hard cap on collected articles
    Returns flat list of article dicts across all subcategories.
    """
    articles = []
    seen_titles = set()    # O(1) dedup instead of scanning the list per page
    visited_cats = set()   # avoid cycles in the category graph

    def _crawl(cat: str, depth: int):
        if depth > max_depth or cat in visited_cats:
            return
        visited_cats.add(cat)
        # Get direct articles in this category
        pages = get_category_members(cat, limit=100, member_type="page")
        for p in pages:
            if p["title"] not in seen_titles:
                seen_titles.add(p["title"])
                articles.append(p)
                if len(articles) >= max_articles:
                    return
        # Get subcategories and recurse
        subcats = get_category_members(cat, limit=30, member_type="subcat")
        for sc in subcats:
            if len(articles) >= max_articles:
                return
            cat_name = sc["title"].replace("Category:", "")
            time.sleep(0.2)  # be polite between category requests
            _crawl(cat_name, depth + 1)

    _crawl(category, 0)
    return articles[:max_articles]
# Get all articles in the Machine Learning category tree
# (depth 2 = the category itself plus two levels of subcategories)
articles = crawl_category_tree("Machine learning", max_depth=2, max_articles=500)
print(f"Found {len(articles)} articles in Machine Learning category tree")
for a in articles[:10]:
    print(f" {a['title']}")
Backlinks and Links
def get_backlinks(title: str, limit: int = 100) -> list:
    """Get articles that link to this page (what links here)."""
    request_params = {
        "action": "query",
        "list": "backlinks",
        "bltitle": title,
        "bllimit": limit,
        "blnamespace": 0,  # article namespace only
        "format": "json",
    }
    payload = SESSION.get(WIKI_API, params=request_params).json()
    return payload["query"]["backlinks"]
def get_outgoing_links(title: str) -> list:
    """Get all links from an article to other Wikipedia articles."""
    request_params = {
        "action": "query",
        "titles": title,
        "prop": "links",
        "pllimit": 500,
        "plnamespace": 0,
        "format": "json",
        "formatversion": "2",
    }
    payload = SESSION.get(WIKI_API, params=request_params).json()
    page_list = payload["query"]["pages"]
    if not page_list:
        return []
    return [entry["title"] for entry in page_list[0].get("links", [])]
def get_article_metadata(title: str) -> dict:
    """Get categories, links, images, and coordinates in one request."""
    payload = SESSION.get(WIKI_API, params={
        "action": "query",
        "titles": title,
        "prop": "categories|links|images|coordinates|pageprops",
        "cllimit": 50,
        "pllimit": 100,
        "imlimit": 50,
        "format": "json",
        "formatversion": "2",
    }).json()
    page_list = payload["query"]["pages"]
    if not page_list:
        return {}
    page = page_list[0]
    coords = page.get("coordinates")
    return {
        "title": page.get("title"),
        "pageid": page.get("pageid"),
        "categories": [c["title"].replace("Category:", "")
                       for c in page.get("categories", [])],
        "links": [entry["title"] for entry in page.get("links", [])],
        "images": [img["title"] for img in page.get("images", [])],
        # The API returns coordinates as a list; keep only the primary pair.
        "coordinates": coords[0] if coords else None,
        "wikidata_id": page.get("pageprops", {}).get("wikibase_item"),
    }
The REST API: Clean and Simple
Wikipedia's REST API at en.wikipedia.org/api/rest_v1/ offers cleaner endpoints for common operations:
def get_summary(title: str) -> dict:
    """Get article summary with thumbnail. Ideal for quick data extraction.

    title: article title; spaces are converted to underscores and the
    result is percent-encoded so titles containing "&", "?", or "/" form
    a valid REST path segment (e.g. "AT&T", "AC/DC").
    Returns a dict with an "error" key when the article does not exist.
    """
    from urllib.parse import quote
    # REST paths use underscores for spaces; encode everything else.
    path_title = quote(title.replace(" ", "_"), safe="")
    resp = SESSION.get(f"{REST_API}/page/summary/{path_title}")
    if resp.status_code == 404:
        return {"error": "Article not found", "title": title}
    resp.raise_for_status()
    data = resp.json()
    return {
        "title": data["title"],
        "displaytitle": data.get("displaytitle", data["title"]),
        "extract": data["extract"],
        "extract_html": data.get("extract_html", ""),
        "description": data.get("description", ""),
        "thumbnail": data.get("thumbnail", {}).get("source"),
        "original_image": data.get("originalimage", {}).get("source"),
        "url": data["content_urls"]["desktop"]["page"],
        "wikidata_item": data.get("wikibase_item"),
        "coordinates": data.get("coordinates"),
        "last_modified": data.get("timestamp"),
    }
def get_article_sections(title: str) -> list:
    """Get article sections with HTML content (mobile format)."""
    resp = SESSION.get(f"{REST_API}/page/mobile-sections/{title}")
    if resp.status_code == 404:
        return []
    resp.raise_for_status()
    payload = resp.json()
    # The lead (intro) section lives under a separate key from the rest.
    lead = payload.get("lead", {})
    lead_parts = lead.get("sections")
    intro_html = lead_parts[0].get("text", "") if lead_parts else ""
    result = [{
        "heading": lead.get("normalizedtitle", title),
        "level": 0,
        "html": intro_html,
        "id": "intro",
    }]
    for section in payload.get("remaining", {}).get("sections", []):
        result.append({
            "heading": section.get("line", ""),
            "level": section.get("toclevel", 1),
            "html": section.get("text", ""),
            "id": section.get("anchor", ""),
        })
    return result
def get_related_articles(title: str) -> list:
    """Get Wikipedia's related article recommendations."""
    resp = SESSION.get(f"{REST_API}/page/related/{title}")
    if resp.status_code == 404:
        return []
    resp.raise_for_status()
    related = []
    for page in resp.json().get("pages", []):
        related.append({
            "title": page["title"],
            "extract": page.get("extract", "")[:200],  # trim long extracts
            "thumbnail": page.get("thumbnail", {}).get("source"),
        })
    return related
def _srcset_scale(variant: dict) -> float:
    """Numeric value of a srcset scale string like "1x", "1.5x", "2x"."""
    try:
        return float(str(variant.get("scale", "1x")).rstrip("xX"))
    except ValueError:
        return 1.0

def get_page_media(title: str) -> list:
    """Get all image files associated with an article.

    Picks the highest-resolution variant from each image's srcset.
    Returns [] when the article does not exist.
    """
    resp = SESSION.get(f"{REST_API}/page/media-list/{title}")
    if resp.status_code == 404:
        return []
    resp.raise_for_status()
    data = resp.json()
    media = []
    for item in data.get("items", []):
        if item.get("type") == "image":
            src_set = item.get("srcset", [])
            # BUG FIX: compare scales numerically — lexicographic comparison
            # of strings orders "1.5x" before "1x" and picks wrong variants.
            largest = max(src_set, key=_srcset_scale) if src_set else {}
            media.append({
                "title": item.get("title"),
                "type": item.get("type"),
                "caption": item.get("caption", {}).get("text", ""),
                # srcset URLs are protocol-relative ("//upload...")
                "url": "https:" + largest.get("src", "") if largest.get("src") else None,
            })
    return media
# Examples
# REST summary: title, short description, intro extract, and thumbnail URL.
summary = get_summary("Python_(programming_language)")
print(f"{summary['title']}")
print(f"Description: {summary['description']}")
print(f"Extract: {summary['extract'][:300]}...")
print(f"Thumbnail: {summary['thumbnail']}")
Async Bulk Fetching
For large-scale extraction, use async to parallelize requests:
import asyncio
import httpx
# Shared semaphore bounding concurrency for all async fetches below.
SEM = asyncio.Semaphore(10)  # max 10 concurrent requests
async def fetch_summary_async(client: httpx.AsyncClient, title: str) -> dict:
    """Fetch one REST summary; failures are returned as {"error": ...} dicts."""
    async with SEM:  # bound concurrency across all in-flight requests
        try:
            resp = await client.get(f"{REST_API}/page/summary/{title}")
            if resp.status_code == 404:
                return {"title": title, "error": "not found"}
            resp.raise_for_status()
            payload = resp.json()
            return {
                "title": payload["title"],
                "extract": payload.get("extract", ""),
                "description": payload.get("description", ""),
                "thumbnail": payload.get("thumbnail", {}).get("source"),
            }
        except Exception as exc:
            # Report per-title failures instead of aborting the whole batch.
            return {"title": title, "error": str(exc)}
async def bulk_fetch_summaries(titles: list) -> list:
    """Fetch summaries for many titles concurrently (bounded by SEM)."""
    request_headers = {"User-Agent": "MyResearchBot/1.0 ([email protected])"}
    async with httpx.AsyncClient(headers=request_headers, timeout=20) as client:
        pending = [fetch_summary_async(client, name) for name in titles]
        return await asyncio.gather(*pending)
# Fetch many summaries concurrently (10 titles here; scales to hundreds)
programming_languages = [
    "Python (programming language)", "JavaScript", "Rust (programming language)",
    "Go (programming language)", "TypeScript", "Kotlin (programming language)",
    "Swift (programming language)", "Ruby (programming language)",
    "C (programming language)", "C++",
]
results = asyncio.run(bulk_fetch_summaries(programming_languages))
for r in results:
    if "error" not in r:
        print(f"{r['title']}: {r['extract'][:100]}...")
Rate Limits, Etiquette, and Bulk Alternatives
Wikipedia's API guidelines ask for:
- A descriptive User-Agent with contact info (this is enforced — missing UA = 403)
- No more than ~200 requests/second sustained (you realistically won't hit this with Python)
- Add maxlag=5 parameter on Action API to back off when Wikipedia's servers are under load
# Always include maxlag for respectful bulk scraping.
# If replication lag exceeds this many seconds, the API returns a
# maxlag error instead of serving the request — retry after a pause.
params = {
    "action": "query",
    "titles": title,
    "prop": "extracts",
    "maxlag": 5,  # Back off if servers are laggy
    "format": "json",
}
For bulk extraction of millions of articles — use database dumps instead:
Wikipedia releases full database dumps every 2-4 weeks at dumps.wikimedia.org. These are XML files containing the full text of every article. For any project that needs the whole encyclopedia, parsing the dump is faster and more respectful than making millions of API calls.
# Parse Wikipedia XML dump (for large-scale extraction)
import xml.etree.ElementTree as ET
import bz2
def parse_wiki_dump(dump_path: str, max_articles: int = 1000):
    """
    Parse a Wikipedia XML dump file.

    dump_path: path to enwiki-*-pages-articles.xml.bz2
    max_articles: stop after collecting this many articles

    Returns a list of {"title": ..., "wikitext": ...} dicts with wikitext
    truncated to 5000 chars. Redirects and pages whose title contains a
    colon (namespace pages like "Talk:..." or "Category:...") are skipped.
    """
    articles = []
    NS = "http://www.mediawiki.org/xml/dtd/export-0.10/"
    with bz2.open(dump_path, "rb") as f:
        for event, elem in ET.iterparse(f, events=["end"]):
            if elem.tag == f"{{{NS}}}page":
                title_el = elem.find(f"{{{NS}}}title")
                text_el = elem.find(f".//{{{NS}}}text")
                if title_el is not None and text_el is not None:
                    title = title_el.text
                    text = text_el.text or ""
                    # BUG FIX: the redirect marker is case-insensitive
                    # ("#REDIRECT", "#redirect") and may follow whitespace.
                    is_redirect = text.lstrip().upper().startswith("#REDIRECT")
                    # Skip redirects and non-article (namespaced) pages.
                    if not is_redirect and ":" not in title:
                        articles.append({"title": title, "wikitext": text[:5000]})
                        if len(articles) >= max_articles:
                            break
                elem.clear()  # Free memory for processed <page> elements
    return articles
For projects where you need to make high volumes of API requests simultaneously — such as cross-referencing thousands of article summaries — routing through a proxy service like ThorData distributes requests across multiple IPs and prevents any single IP from triggering Wikipedia's automated abuse detection systems.
Useful Complete Examples
Build a topic knowledge graph:
def build_knowledge_graph(seed_topic: str, depth: int = 2) -> dict:
    """Build a graph of related articles starting from a seed topic."""
    graph = {"nodes": [], "edges": []}
    seen = set()

    def explore(title: str, current_depth: int):
        # Stop at the depth limit and never revisit a title.
        if current_depth > depth or title in seen:
            return
        seen.add(title)
        summary = get_summary(title)
        if "error" in summary:
            return
        graph["nodes"].append({
            "id": title,
            "description": summary.get("description", ""),
            "wikidata": summary.get("wikidata_item"),
        })
        if current_depth >= depth:
            return
        # limit to 10 links per article
        for neighbour in get_outgoing_links(title)[:10]:
            graph["edges"].append({"from": title, "to": neighbour})
            time.sleep(0.2)
            explore(neighbour, current_depth + 1)

    explore(seed_topic, 0)
    return graph
# Build a small graph: the seed article plus one hop of outgoing links.
graph = build_knowledge_graph("Artificial intelligence", depth=1)
print(f"Nodes: {len(graph['nodes'])}, Edges: {len(graph['edges'])}")
Summary
Wikipedia is the benchmark for API-friendly data access. Key takeaways:
- Always set User-Agent — it's required, not optional
- Use REST API for summaries — it's simpler and returns clean, structured data
- Use Action API for search, revisions, categories — more powerful for complex queries
- Batch requests — use pipe-separated titles ("Python|JavaScript|Rust") to fetch up to 50 articles in one call
- Use async for bulk work — `httpx.AsyncClient` with a semaphore is much faster than sequential requests
- For millions of articles, use dumps — don't API-scrape the whole encyclopedia; download the dump
- Add `maxlag=5` — be a respectful bot that backs off under server load
The Action API is flexible but complex — you'll spend time in the parameter documentation. The REST API is simpler but covers fewer use cases. For most scraping tasks, start with the REST API for summaries and basic content, use the Action API for search, revisions, and category crawling, and use database dumps if you need the whole encyclopedia.