← Back to blog

How to Scrape PubMed Research Papers with Python (2026)

PubMed indexes over 36 million biomedical and life science articles. If you're building a literature review pipeline, tracking citation networks, or doing meta-analysis, you need programmatic access to this data.

The good news: NCBI (National Center for Biotechnology Information) provides Entrez, a well-documented API suite specifically designed for this. Unlike most scraping targets, PubMed wants you to access their data programmatically — they just want you to do it through their API rather than hammering their HTML pages.

Here's how to build a proper PubMed data pipeline.

Setup and API Key

Register for an NCBI API key at ncbi.nlm.nih.gov/account. Without a key, you're limited to 3 requests per second. With a key, you get 10 per second. For any serious work, get the key.

# Standard library
import json
import sqlite3
import time
import xml.etree.ElementTree as ET
from datetime import datetime
from datetime import timezone
from typing import Optional

# Third-party
import httpx

# Entrez credentials: the key lifts the rate limit from 3 to 10 requests/second.
NCBI_KEY = "YOUR_NCBI_API_KEY"
# Root URL for all E-utilities endpoints (esearch, efetch, elink, ...).
BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"

# Shared HTTP client, reused by every helper below for connection pooling.
# NOTE(review): build_paper_database rebinds this global when a proxy is set.
client = httpx.Client(timeout=30)

def entrez_params(**kwargs) -> dict:
    """Build the query-parameter dict for an Entrez request.

    Starts from the API-key/tool/email identification NCBI asks every
    client to send, then overlays any call-specific parameters (which
    may override the defaults).
    """
    identification = dict(
        api_key=NCBI_KEY,
        tool="research_pipeline",
        email="[email protected]",
    )
    return {**identification, **kwargs}

Searching for Articles

ESearch finds article IDs matching your query. PubMed uses a powerful query syntax — you can search by title, author, journal, date range, MeSH terms, and more.

def search_pubmed(
    query: str,
    max_results: int = 100,
    sort: str = "relevance",
    date_range: tuple[str, str] | None = None,
) -> list[str]:
    """Search PubMed via ESearch and return a list of matching PMIDs.

    sort options: relevance, pub_date, Author, JournalName
    date_range: ("2024/01/01", "2026/12/31") for mindate/maxdate
    """
    collected: list[str] = []
    page_size = min(max_results, 500)

    # Optional publication-date window, applied to every page request.
    date_params: dict = {}
    if date_range:
        date_params = {
            "mindate": date_range[0],
            "maxdate": date_range[1],
            "datetype": "pdat",
        }

    offset = 0
    while offset < max_results:
        resp = client.get(f"{BASE}/esearch.fcgi", params=entrez_params(
            db="pubmed",
            term=query,
            retmax=page_size,
            retstart=offset,
            retmode="json",
            sort=sort,
            **date_params,
        ))
        resp.raise_for_status()
        result = resp.json().get("esearchresult", {})
        page = result.get("idlist", [])
        if not page:
            # No more hits — the query matched fewer IDs than requested.
            break
        collected.extend(page)
        total = int(result.get("count", 0))
        print(f"Found {total} total; fetched {len(collected)} so far")
        time.sleep(0.15)  # stay under the keyed 10 req/s limit
        offset += page_size

    return collected[:max_results]


# Example: recent CRISPR papers
# NOTE(review): this runs at import time and performs live Entrez requests.
# The [dp] tag restricts results to publication dates 2025-2026.
pmids = search_pubmed(
    "CRISPR gene editing 2025:2026[dp]",
    max_results=200,
    sort="pub_date",
)
print(f"Found {len(pmids)} CRISPR articles")

Query Syntax Reference

PubMed's query language is powerful once you learn the field tags:

| Query | What it searches |
| --- | --- |
| `cancer[ti]` | Title only |
| `Smith J[au]` | Author |
| `Nature[jour]` | Journal name |
| `2025:2026[dp]` | Date range (publication year) |
| `"machine learning"[MeSH]` | MeSH controlled vocabulary |
| `review[pt]` | Publication type (review, clinical trial, meta-analysis) |
| `free full text[filter]` | Only open-access articles |
| `hasabstract` | Only articles with abstracts |
| `"United States"[pl]` | Place of publication |
| `English[la]` | Language filter |

Combine with AND, OR, NOT: "deep learning"[ti] AND radiology[MeSH] AND 2025:2026[dp] AND hasabstract

Note that PubMed does not apply the usual NOT > AND > OR precedence — it processes Boolean operators strictly left to right. Use parentheses to be explicit:

(("machine learning" OR "deep learning")[ti]) AND (cancer OR tumor)[MeSH] AND 2025:2026[dp]

Fetching Article Metadata

EFetch retrieves full metadata for a list of PMIDs. XML format gives you the most detail:

def fetch_articles(pmids: list[str]) -> list[dict]:
    """Fetch full metadata for a list of PMIDs."""
    parsed: list[dict] = []

    # EFetch accepts at most 200 IDs per request, so page through the list.
    start = 0
    while start < len(pmids):
        chunk = pmids[start:start + 200]
        resp = client.get(f"{BASE}/efetch.fcgi", params=entrez_params(
            db="pubmed",
            id=",".join(chunk),
            rettype="xml",
            retmode="xml",
        ))
        resp.raise_for_status()

        tree = ET.fromstring(resp.text)
        parsed.extend(
            parse_article(node) for node in tree.findall(".//PubmedArticle")
        )

        print(f"Parsed {len(parsed)} articles total")
        time.sleep(0.15)  # respect the Entrez rate limit between pages
        start += 200

    return parsed


def parse_article(elem) -> dict:
    """Parse a PubmedArticle XML element into a clean dict.

    elem: ElementTree element for one <PubmedArticle>, which contains both
    <MedlineCitation> (bibliographic record) and <PubmedData> (ID list).

    Returns a flat dict of metadata; missing fields come back as empty
    strings/lists rather than None.
    """
    medline = elem.find(".//MedlineCitation")
    article = medline.find(".//Article")

    # Title — itertext handles bold/italic tags within titles
    title_elem = article.find(".//ArticleTitle")
    title = "".join(title_elem.itertext()) if title_elem is not None else ""

    # Abstract — may have multiple labeled sections (Background, Methods, etc.)
    abstract_parts = []
    for ab in article.findall(".//AbstractText"):
        label = ab.get("Label", "")
        text = "".join(ab.itertext())
        if label:
            abstract_parts.append(f"{label}: {text}")
        else:
            abstract_parts.append(text)

    # Authors with affiliations. Authors without a LastName (e.g. collective
    # group names) are skipped, but their affiliations are still collected.
    authors = []
    affiliations = []
    for au in article.findall(".//Author"):
        last = au.findtext("LastName", "")
        fore = au.findtext("ForeName", "")
        initials = au.findtext("Initials", "")
        orcid = ""
        for ident in au.findall("Identifier"):
            if ident.get("Source") == "ORCID":
                orcid = ident.text or ""
        if last:
            authors.append({
                "name": f"{last} {fore}".strip(),
                "initials": f"{last} {initials}".strip(),
                "orcid": orcid,
            })
        for aff in au.findall(".//Affiliation"):
            if aff.text:
                affiliations.append(aff.text)

    # Journal info
    journal = article.findtext(".//Journal/Title", "")
    issn = article.findtext(".//Journal/ISSN", "")
    volume = article.findtext(".//Journal/JournalIssue/Volume", "")
    issue = article.findtext(".//Journal/JournalIssue/Issue", "")

    # Publication date
    pub_date = article.find(".//PubDate")
    year = month = day = ""
    if pub_date is not None:
        year = pub_date.findtext("Year", "")
        month = pub_date.findtext("Month", "")
        day = pub_date.findtext("Day", "")
        # Handle MedlineDate format ("2025 Jan-Feb")
        if not year:
            medline_date = pub_date.findtext("MedlineDate", "")
            if medline_date:
                year = medline_date[:4]

    # PMID and DOI
    pmid = medline.findtext(".//PMID", "")
    doi = ""
    pmc_id = ""
    for id_elem in article.findall(".//ELocationID"):
        if id_elem.get("EIdType") == "doi":
            doi = id_elem.text or ""
    # Bug fix: <ArticleId> entries live under <PubmedData>, which is a
    # SIBLING of <MedlineCitation> — searching from `medline` never matched,
    # so pmc_id was always empty. Search from the PubmedArticle root instead.
    for id_elem in elem.findall(".//ArticleId"):
        id_type = id_elem.get("IdType")
        if id_type == "pmc":
            pmc_id = id_elem.text or ""
        elif id_type == "doi" and not doi:
            # Fallback: some records carry the DOI only in the ArticleIdList.
            doi = id_elem.text or ""

    # MeSH terms (controlled vocabulary), rendered as "Descriptor/Qualifier..."
    mesh_terms = []
    for mh in medline.findall(".//MeshHeading"):
        descriptor = mh.findtext("DescriptorName", "")
        qualifiers = [q.text for q in mh.findall("QualifierName") if q.text]
        if descriptor:
            if qualifiers:
                mesh_terms.append(f"{descriptor}/{'/'.join(qualifiers)}")
            else:
                mesh_terms.append(descriptor)

    # Author-supplied keywords
    keywords = [
        kw.text for kw in medline.findall(".//Keyword") if kw.text
    ]

    # Publication types (review, clinical trial, ...)
    pub_types = [
        pt.text for pt in article.findall(".//PublicationType") if pt.text
    ]

    return {
        "pmid": pmid,
        "pmc_id": pmc_id,
        "title": title,
        "abstract": " ".join(abstract_parts),
        "authors": [a["name"] for a in authors],
        "author_details": authors,
        "affiliations": list(set(affiliations)),
        "journal": journal,
        "issn": issn,
        "volume": volume,
        "issue": issue,
        "year": year,
        "month": month,
        "day": day,
        "doi": doi,
        "mesh_terms": mesh_terms,
        "keywords": keywords,
        "pub_types": pub_types,
    }

Getting Citation Counts

PubMed Central's cited-by data is available through ELink:

def get_citation_count(pmid: str) -> int:
    """Get number of articles in PubMed that cite this PMID.

    Counts only citations from other PubMed-indexed articles (the ELink
    "citedin" link set).
    """
    resp = client.get(f"{BASE}/elink.fcgi", params=entrez_params(
        db="pubmed",
        dbfrom="pubmed",
        id=pmid,
        linkname="pubmed_pubmed_citedin",
        retmode="json",
    ))
    resp.raise_for_status()
    payload = resp.json()

    link_dbs = payload.get("linksets", [{}])[0].get("linksetdbs", [])
    if not link_dbs:
        return 0
    return len(link_dbs[0].get("links", []))


def get_related_articles(pmid: str, max_related: int = 10) -> list[str]:
    """Get PMIDs of related articles (similarity-based)."""
    resp = client.get(f"{BASE}/elink.fcgi", params=entrez_params(
        db="pubmed",
        dbfrom="pubmed",
        id=pmid,
        linkname="pubmed_pubmed",
        retmode="json",
    ))
    resp.raise_for_status()
    payload = resp.json()

    link_dbs = payload.get("linksets", [{}])[0].get("linksetdbs", [])
    for candidate in link_dbs:
        if candidate.get("linkname") != "pubmed_pubmed":
            continue
        # Take the first max_related links, then drop the article itself.
        head = candidate.get("links", [])[:max_related]
        return [str(item) for item in head if str(item) != pmid]
    return []


# Example: get citations for a set of papers
# Sleeps between ELink calls to stay under the keyed 10 req/s limit.
for pmid in pmids[:10]:
    count = get_citation_count(pmid)
    print(f"PMID {pmid}: {count} citations")
    time.sleep(0.15)

Note that NCBI's citation data is less comprehensive than Google Scholar or Semantic Scholar — it only counts citations from other PubMed-indexed articles. For complete citation networks, query the Semantic Scholar API as a supplement.

Semantic Scholar Citation Enrichment

The Semantic Scholar API is free, no API key required for light use, and has broader citation data:

def get_semantic_scholar_citations(doi: str) -> dict:
    """Get citation count from Semantic Scholar (more complete than PubMed).

    Returns an empty dict for a missing DOI or a paper Semantic Scholar
    does not know about (HTTP 404).
    """
    if not doi:
        return {}

    resp = httpx.get(
        f"https://api.semanticscholar.org/graph/v1/paper/DOI:{doi}",
        params={"fields": "citationCount,influentialCitationCount,referenceCount,year"},
        timeout=15,
    )
    if resp.status_code == 404:
        return {}
    resp.raise_for_status()
    return resp.json()


# Enrich articles with Semantic Scholar citation counts
def enrich_with_citations(articles: list[dict]) -> list[dict]:
    """Add Semantic Scholar citation counts to article list (in place)."""
    for record in articles:
        doi = record.get("doi")
        if not doi:
            continue
        ss_data = get_semantic_scholar_citations(doi)
        record["citation_count_ss"] = ss_data.get("citationCount", 0)
        record["influential_citations"] = ss_data.get("influentialCitationCount", 0)
        time.sleep(0.5)  # Semantic Scholar rate limit
    return articles

Bulk Downloading for Research Pipelines

For large-scale literature mining, build a SQLite database:

def init_paper_db(db_path: str) -> sqlite3.Connection:
    """Create the papers schema (idempotent) and return an open connection.

    Creates the main `papers` table, two lookup indexes, and an FTS5
    full-text index over title/abstract backed by `papers` as an
    external-content table.

    NOTE(review): no triggers keep papers_fts in sync with papers, so the
    FTS index must be rebuilt after inserts (search_local_corpus does this).
    """
    conn = sqlite3.connect(db_path)
    conn.executescript("""
        CREATE TABLE IF NOT EXISTS papers (
            pmid            TEXT PRIMARY KEY,
            pmc_id          TEXT,
            title           TEXT,
            abstract        TEXT,
            authors         TEXT,
            affiliations    TEXT,
            journal         TEXT,
            issn            TEXT,
            volume          TEXT,
            issue           TEXT,
            year            TEXT,
            month           TEXT,
            doi             TEXT,
            mesh_terms      TEXT,
            keywords        TEXT,
            pub_types       TEXT,
            citation_count  INTEGER DEFAULT 0,
            full_text_url   TEXT,
            fetched_at      TEXT DEFAULT CURRENT_TIMESTAMP
        );

        CREATE INDEX IF NOT EXISTS idx_year ON papers(year);
        CREATE INDEX IF NOT EXISTS idx_journal ON papers(journal);
        CREATE VIRTUAL TABLE IF NOT EXISTS papers_fts USING fts5(
            pmid UNINDEXED,
            title,
            abstract,
            content=papers,
            content_rowid=rowid
        );
    """)
    conn.commit()
    return conn


def build_paper_database(
    query: str,
    db_path: str,
    max_papers: int = 5000,
    proxy_url: str | None = None,
):
    """Build a local SQLite database of PubMed articles.

    Searches PubMed for `query`, skips PMIDs already stored, fetches
    metadata for the rest, and inserts them into `db_path` using the
    schema from init_paper_db.

    proxy_url: optional proxy for high-throughput pipelines
    """
    global client
    if proxy_url:
        # Rebind the module-level client so every helper routes via the proxy.
        client = httpx.Client(timeout=30, proxy=proxy_url)

    db = init_paper_db(db_path)
    # Timezone-aware timestamp; datetime.utcnow() is deprecated since 3.12
    # and produced a naive (offset-less) value.
    now = datetime.now(timezone.utc).isoformat()

    try:
        # Get all PMIDs
        pmids = search_pubmed(query, max_results=max_papers)
        print(f"Found {len(pmids)} PMIDs to process")

        # Skip already-fetched PMIDs
        existing = set(
            row[0] for row in db.execute("SELECT pmid FROM papers").fetchall()
        )
        new_pmids = [p for p in pmids if p not in existing]
        print(f"New PMIDs to fetch: {len(new_pmids)} ({len(existing)} already in DB)")

        # Fetch and store in batches
        articles = fetch_articles(new_pmids)

        for art in articles:
            db.execute(
                """INSERT OR IGNORE INTO papers
                   (pmid, pmc_id, title, abstract, authors, affiliations, journal, issn,
                    volume, issue, year, month, doi, mesh_terms, keywords, pub_types, fetched_at)
                   VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
                (
                    art["pmid"], art.get("pmc_id"), art["title"], art["abstract"],
                    json.dumps(art["authors"]), json.dumps(art.get("affiliations", [])),
                    art["journal"], art.get("issn"), art.get("volume"), art.get("issue"),
                    art["year"], art.get("month"), art["doi"],
                    json.dumps(art["mesh_terms"]), json.dumps(art.get("keywords", [])),
                    json.dumps(art.get("pub_types", [])), now
                )
            )

        db.commit()
        total = db.execute("SELECT COUNT(*) FROM papers").fetchone()[0]
        print(f"Database now contains {total} papers")
    finally:
        # Close the connection even if a network fetch fails partway through.
        db.close()


# Build AI drug discovery database
# Example pipeline run: quoted phrases with a [dp] date filter, capped at
# 2000 papers. Runs at import time and performs live network requests.
build_paper_database(
    '"artificial intelligence" AND "drug discovery" AND 2024:2026[dp]',
    "ai_drug_discovery.db",
    max_papers=2000,
)

Full-Text Search with FTS5

Once you have a local database, FTS5 enables full-text search across your corpus:

def search_local_corpus(db_path: str, query: str, limit: int = 20) -> list[dict]:
    """Full-text search across local paper database.

    Returns up to `limit` rows ordered by FTS5 relevance rank, each with a
    highlighted snippet from the indexed title column.
    """
    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row

    # papers_fts is an external-content table with no sync triggers, so the
    # index must be rebuilt to pick up rows inserted since the last rebuild.
    # NOTE(review): this is O(corpus) per call — fine for modest databases.
    conn.execute("INSERT INTO papers_fts(papers_fts) VALUES('rebuild')")
    # Bug fix: without a commit the rebuild was rolled back on close(), so
    # every call redid the full rebuild from scratch.
    conn.commit()

    results = conn.execute("""
        SELECT p.pmid, p.title, p.journal, p.year, p.doi,
               snippet(papers_fts, 1, '<b>', '</b>', '...', 20) as snippet
        FROM papers_fts
        JOIN papers p ON papers_fts.pmid = p.pmid
        WHERE papers_fts MATCH ?
        ORDER BY rank
        LIMIT ?
    """, (query, limit)).fetchall()

    conn.close()
    return [dict(r) for r in results]


# Find papers mentioning a specific protein
hits = search_local_corpus("ai_drug_discovery.db", "transformer attention protein folding")
for hit in hits:
    # snippet already carries <b>...</b> highlight markers from FTS5
    print(f"[{hit['year']}] {hit['title'][:80]}...")
    print(f"  {hit['journal']} | PMID: {hit['pmid']}")
    print(f"  ...{hit['snippet']}...")
    print()

With a populated database, you can analyze publication trends over time:

def analyze_trends(db_path: str, keyword: str) -> dict:
    """Analyze how often a keyword appears in titles/abstracts by year.

    Returns {year: paper_count} for years >= 2015, in ascending order.
    """
    conn = sqlite3.connect(db_path)
    pattern = f"%{keyword}%"

    rows = conn.execute("""
        SELECT year, COUNT(*) as count
        FROM papers
        WHERE (title LIKE ? OR abstract LIKE ?)
          AND year != ''
          AND CAST(year AS INTEGER) >= 2015
        GROUP BY year
        ORDER BY year
    """, (pattern, pattern)).fetchall()

    conn.close()
    return {str(year): count for year, count in rows}


def get_top_journals(db_path: str, limit: int = 20) -> list[tuple]:
    """Get journals with most papers in the database.

    Returns (journal, count) tuples sorted by count descending; rows with
    an empty journal name are excluded.
    """
    conn = sqlite3.connect(db_path)
    ranking = conn.execute("""
        SELECT journal, COUNT(*) as count
        FROM papers
        WHERE journal != ''
        GROUP BY journal
        ORDER BY count DESC
        LIMIT ?
    """, (limit,)).fetchall()
    conn.close()
    return ranking


def get_author_network(db_path: str, min_papers: int = 3) -> dict:
    """Find prolific authors in the dataset.

    Returns {author_name: paper_count} for authors with at least
    `min_papers` papers, ordered most-prolific first. Rows whose authors
    column is not valid JSON are skipped.
    """
    conn = sqlite3.connect(db_path)
    rows = conn.execute("SELECT pmid, authors FROM papers WHERE authors != '[]'").fetchall()
    conn.close()

    counts: dict[str, int] = {}
    for _pmid, raw_authors in rows:
        try:
            name_list = json.loads(raw_authors)
        except json.JSONDecodeError:
            continue
        for name in name_list:
            counts[name] = counts.get(name, 0) + 1

    ranked = sorted(counts.items(), key=lambda item: -item[1])
    return {name: total for name, total in ranked if total >= min_papers}

Rate Limits and Best Practices

NCBI is remarkably generous compared to commercial APIs, but they enforce limits: 3 requests per second without an API key, and 10 per second with one. Exceed the limit repeatedly and your IP gets temporarily blocked.

For running large-scale pipelines that hit the API continuously, routing through a proxy distributes the request load. ThorData residential proxies work well for this — they let you distribute requests across IPs so a temporary rate limit on one IP doesn't stall your entire pipeline:

# For high-throughput pipelines
# Placeholder credentials/host — substitute your own proxy details.
proxy_client = httpx.Client(
    timeout=30,
    proxy="http://USER:[email protected]:9000",
)

# Replace the global client for proxy usage
# (every Entrez helper above reads the module-level `client`)
client = proxy_client

Always set tool and email parameters in your requests — NCBI tracks tool usage and this reduces throttling risk:

# tool/email passed here override the defaults baked into entrez_params().
params = entrez_params(
    db="pubmed",
    term=query,
    tool="my_research_pipeline",
    email="[email protected]",
)

Open Access Full Text

For papers with open-access full text, you can get the content through PMC:

def get_full_text_url(pmid: str) -> Optional[str]:
    """Get PMC full-text link if available.

    Maps a PubMed ID to its PMC record via ELink; returns None when the
    article has no PMC deposit.
    """
    resp = client.get(f"{BASE}/elink.fcgi", params=entrez_params(
        db="pmc",
        dbfrom="pubmed",
        id=pmid,
        retmode="json",
    ))
    # Fix: every other Entrez helper checks the HTTP status; without this,
    # an error response surfaced as a confusing JSON decode failure.
    resp.raise_for_status()
    data = resp.json()
    linksets = data.get("linksets", [{}])
    links = linksets[0].get("linksetdbs", [])
    for link_db in links:
        if link_db.get("dbto") == "pmc":
            pmc_ids = link_db.get("links", [])
            if pmc_ids:
                return f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{pmc_ids[0]}/"
    return None


def download_pmc_fulltext(pmc_url: str) -> str:
    """Download full text from PMC article page.

    Returns the extracted body text, or "" when no recognizable article
    container is found in the page.
    """
    from selectolax.parser import HTMLParser

    resp = httpx.get(pmc_url, timeout=30, follow_redirects=True)
    resp.raise_for_status()

    doc = HTMLParser(resp.text)

    # PMC article body is in #mc-article-content or .article-content
    article_body = doc.css_first("#mc-article-content, .article-content, #article-content")
    if not article_body:
        return ""

    # Strip non-prose elements (figures, tables, reference lists) first.
    for node in article_body.css("figure, .fig, .table-wrap, #references, .ref-list"):
        node.decompose()

    return article_body.text(separator="\n", strip=True)

FTP Bulk Download for Massive Datasets

For datasets exceeding tens of thousands of papers, the NCBI FTP service is more appropriate than the API:

import ftplib
import gzip
import os

def download_pubmed_baseline(output_dir: str, max_files: int = 5):
    """
    Download PubMed annual baseline files from NCBI FTP.
    Each file contains ~30,000 articles in XML format.
    Full baseline has ~1,200 files = ~36 million articles.

    output_dir: directory the .xml.gz files are saved into (created if missing)
    max_files: cap on how many baseline files to pull
    """
    os.makedirs(output_dir, exist_ok=True)

    ftp = ftplib.FTP("ftp.ncbi.nlm.nih.gov")
    ftp.login()  # anonymous login
    ftp.cwd("/pubmed/baseline/")

    files = [f for f in ftp.nlst() if f.endswith(".xml.gz")]
    print(f"Found {len(files)} baseline files")

    # Bug fix: the log lines and the RETR command contained a literal
    # "(unknown)" placeholder instead of the filename — the FTP RETR
    # command could never retrieve anything.
    for i, filename in enumerate(files[:max_files]):
        local_path = os.path.join(output_dir, filename)
        if os.path.exists(local_path):
            print(f"  {filename} already downloaded, skipping")
            continue

        print(f"  Downloading {filename} ({i+1}/{min(max_files, len(files))})")
        with open(local_path, "wb") as f:
            ftp.retrbinary(f"RETR {filename}", f.write)

    ftp.quit()
    return output_dir


def parse_baseline_file(gz_path: str) -> list[dict]:
    """Parse a compressed PubMed baseline XML file."""
    with gzip.open(gz_path, "rb") as fh:
        root = ET.fromstring(fh.read())

    parsed: list[dict] = []
    for node in root.findall(".//PubmedArticle"):
        try:
            parsed.append(parse_article(node))
        except Exception:
            # Skip malformed entries rather than aborting the whole file
            continue

    return parsed

PubMed is one of the best-behaved scraping targets you'll encounter. The API is comprehensive, the documentation is excellent, and NCBI actively supports programmatic access. Use the tools they provide — the Entrez API, FTP bulk downloads, and the PMC full-text service — and you'll have a robust research pipeline that handles millions of articles without any anti-bot friction.