How to Scrape PubMed Research Papers with Python (2026)
PubMed indexes over 36 million biomedical and life science articles. If you're building a literature review pipeline, tracking citation networks, or doing meta-analysis, you need programmatic access to this data.
The good news: NCBI (National Center for Biotechnology Information) provides Entrez, a well-documented API suite specifically designed for this. Unlike most scraping targets, PubMed wants you to access their data programmatically — they just want you to do it through their API rather than hammering their HTML pages.
Here's how to build a proper PubMed data pipeline.
Setup and API Key
Register for an NCBI API key at ncbi.nlm.nih.gov/account. Without a key, you're limited to 3 requests per second. With a key, you get 10 per second. For any serious work, get the key.
import httpx
import xml.etree.ElementTree as ET
import time
import json
import sqlite3
from typing import Optional
from datetime import datetime

# NCBI credentials and endpoint. The API key raises the rate limit from
# 3 to 10 requests/second (see the rate-limit section below).
NCBI_KEY = "YOUR_NCBI_API_KEY"
BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
# Shared HTTP client, reused across all Entrez calls for connection pooling.
# NOTE(review): build_paper_database() may rebind this global to a proxied client.
client = httpx.Client(timeout=30)
def entrez_params(**kwargs) -> dict:
    """Build the query-parameter dict shared by every Entrez request.

    Always carries the API key plus the tool/email identification NCBI
    asks clients to send; keyword arguments override or extend those
    defaults (later keys win).
    """
    return {
        "api_key": NCBI_KEY,
        "tool": "research_pipeline",
        "email": "[email protected]",
        **kwargs,
    }
Searching for Articles
ESearch finds article IDs matching your query. PubMed uses a powerful query syntax — you can search by title, author, journal, date range, MeSH terms, and more.
def search_pubmed(
    query: str,
    max_results: int = 100,
    sort: str = "relevance",
    date_range: tuple[str, str] | None = None,
) -> list[str]:
    """Run an ESearch query and return matching PMIDs, paging as needed.

    sort: one of relevance, pub_date, Author, JournalName.
    date_range: ("2024/01/01", "2026/12/31") mapped to mindate/maxdate
    (publication-date filtering via datetype=pdat).
    """
    collected: list[str] = []
    page_size = min(max_results, 500)

    date_params: dict = {}
    if date_range:
        date_params = {
            "mindate": date_range[0],
            "maxdate": date_range[1],
            "datetype": "pdat",
        }

    offset = 0
    while offset < max_results:
        resp = client.get(
            f"{BASE}/esearch.fcgi",
            params=entrez_params(
                db="pubmed",
                term=query,
                retmax=page_size,
                retstart=offset,
                retmode="json",
                sort=sort,
                **date_params,
            ),
        )
        resp.raise_for_status()
        result = resp.json().get("esearchresult", {})
        page = result.get("idlist", [])
        if not page:
            # No more results — stop paging early.
            break
        collected.extend(page)
        total = int(result.get("count", 0))
        print(f"Found {total} total; fetched {len(collected)} so far")
        time.sleep(0.15)  # stay under the API-key rate limit
        offset += page_size

    return collected[:max_results]
# Example: recent CRISPR papers.
# `2025:2026[dp]` restricts the publication-date range inside the query itself.
pmids = search_pubmed(
    "CRISPR gene editing 2025:2026[dp]",
    max_results=200,
    sort="pub_date",
)
print(f"Found {len(pmids)} CRISPR articles")
Query Syntax Reference
PubMed's query language is powerful once you learn the field tags:
| Query | What it searches |
|---|---|
| `cancer[ti]` | Title only |
| `Smith J[au]` | Author |
| `Nature[jour]` | Journal name |
| `2025:2026[dp]` | Date range (publication year) |
| `"machine learning"[MeSH]` | MeSH controlled vocabulary |
| `review[pt]` | Publication type (review, clinical trial, meta-analysis) |
| `free full text[filter]` | Only open-access articles |
| `hasabstract` | Only articles with abstracts |
| `"United States"[pl]` | Place of publication |
| `English[la]` | Language filter |
Combine with AND, OR, NOT: "deep learning"[ti] AND radiology[MeSH] AND 2025:2026[dp] AND hasabstract
Boolean handling: PubMed processes Boolean operators left to right rather than by the conventional NOT > AND > OR precedence, and field tags apply to individual terms, not to parenthesized groups. Use parentheses and tag each term to be explicit:
("machine learning"[ti] OR "deep learning"[ti]) AND (cancer[MeSH] OR tumor[MeSH]) AND 2025:2026[dp]
Fetching Article Metadata
EFetch retrieves full metadata for a list of PMIDs. XML format gives you the most detail:
def fetch_articles(pmids: list[str]) -> list[dict]:
    """Retrieve and parse full EFetch metadata for each PMID.

    IDs are submitted in chunks of 200 (the Entrez per-request maximum);
    each response is parsed into dicts via parse_article().
    """
    CHUNK = 200  # Entrez maximum IDs per EFetch request
    parsed: list[dict] = []

    for start in range(0, len(pmids), CHUNK):
        chunk = pmids[start:start + CHUNK]
        resp = client.get(
            f"{BASE}/efetch.fcgi",
            params=entrez_params(
                db="pubmed",
                id=",".join(chunk),
                rettype="xml",
                retmode="xml",
            ),
        )
        resp.raise_for_status()
        root = ET.fromstring(resp.text)
        parsed.extend(
            parse_article(node) for node in root.findall(".//PubmedArticle")
        )
        print(f"Parsed {len(parsed)} articles total")
        time.sleep(0.15)  # rate-limit courtesy between chunks

    return parsed
def parse_article(elem) -> dict:
    """Parse a PubmedArticle XML element into a clean, JSON-serializable dict.

    `elem` is a <PubmedArticle> Element (as returned by EFetch). All values
    are plain strings or lists of strings.
    """
    medline = elem.find(".//MedlineCitation")
    article = medline.find(".//Article")

    # Title — itertext handles bold/italic tags within titles
    title_elem = article.find(".//ArticleTitle")
    title = "".join(title_elem.itertext()) if title_elem is not None else ""

    # Abstract — may have multiple labeled sections (Background, Methods, etc.)
    abstract_parts = []
    for ab in article.findall(".//AbstractText"):
        label = ab.get("Label", "")
        text = "".join(ab.itertext())
        abstract_parts.append(f"{label}: {text}" if label else text)

    # Authors with affiliations
    authors = []
    affiliations = []
    for au in article.findall(".//Author"):
        last = au.findtext("LastName", "")
        fore = au.findtext("ForeName", "")
        initials = au.findtext("Initials", "")
        orcid = ""
        for ident in au.findall("Identifier"):
            if ident.get("Source") == "ORCID":
                orcid = ident.text or ""
        if last:
            authors.append({
                "name": f"{last} {fore}".strip(),
                "initials": f"{last} {initials}".strip(),
                "orcid": orcid,
            })
        else:
            # Group/consortium authors carry a CollectiveName instead of
            # LastName/ForeName; previously these were silently dropped.
            collective = au.findtext("CollectiveName", "")
            if collective:
                authors.append({
                    "name": collective,
                    "initials": collective,
                    "orcid": orcid,
                })
        for aff in au.findall(".//Affiliation"):
            if aff.text:
                affiliations.append(aff.text)

    # Journal info
    journal = article.findtext(".//Journal/Title", "")
    issn = article.findtext(".//Journal/ISSN", "")
    volume = article.findtext(".//Journal/JournalIssue/Volume", "")
    issue = article.findtext(".//Journal/JournalIssue/Issue", "")

    # Publication date
    pub_date = article.find(".//PubDate")
    year = month = day = ""
    if pub_date is not None:
        year = pub_date.findtext("Year", "")
        month = pub_date.findtext("Month", "")
        day = pub_date.findtext("Day", "")
        # Handle MedlineDate format ("2025 Jan-Feb")
        if not year:
            medline_date = pub_date.findtext("MedlineDate", "")
            if medline_date:
                year = medline_date[:4]

    # PMID and article identifiers. The DOI may appear as an ELocationID on
    # the Article. BUG FIX: the PMC id lives in PubmedData/ArticleIdList,
    # which is a SIBLING of MedlineCitation — searching under `medline`
    # never found it, so pmc_id was always empty. Search from the
    # PubmedArticle root (`elem`) instead.
    pmid = medline.findtext(".//PMID", "")
    doi = ""
    pmc_id = ""
    for id_elem in article.findall(".//ELocationID"):
        if id_elem.get("EIdType") == "doi":
            doi = id_elem.text or ""
    for id_elem in elem.findall(".//ArticleId"):
        if id_elem.get("IdType") == "pmc":
            pmc_id = id_elem.text or ""
        elif id_elem.get("IdType") == "doi" and not doi:
            # Fallback: some records only carry the DOI in ArticleIdList.
            doi = id_elem.text or ""

    # MeSH terms (controlled vocabulary), "Descriptor/Qualifier1/Qualifier2"
    mesh_terms = []
    for mh in medline.findall(".//MeshHeading"):
        descriptor = mh.findtext("DescriptorName", "")
        qualifiers = [q.text for q in mh.findall("QualifierName") if q.text]
        if descriptor:
            if qualifiers:
                mesh_terms.append(f"{descriptor}/{'/'.join(qualifiers)}")
            else:
                mesh_terms.append(descriptor)

    # Keywords
    keywords = [
        kw.text for kw in medline.findall(".//Keyword") if kw.text
    ]

    # Publication types
    pub_types = [
        pt.text for pt in article.findall(".//PublicationType") if pt.text
    ]

    return {
        "pmid": pmid,
        "pmc_id": pmc_id,
        "title": title,
        "abstract": " ".join(abstract_parts),
        "authors": [a["name"] for a in authors],
        "author_details": authors,
        # dict.fromkeys dedupes while keeping first-seen order
        # (list(set(...)) was nondeterministic across runs).
        "affiliations": list(dict.fromkeys(affiliations)),
        "journal": journal,
        "issn": issn,
        "volume": volume,
        "issue": issue,
        "year": year,
        "month": month,
        "day": day,
        "doi": doi,
        "mesh_terms": mesh_terms,
        "keywords": keywords,
        "pub_types": pub_types,
    }
Getting Citation Counts
PubMed Central's cited-by data is available through ELink:
def get_citation_count(pmid: str) -> int:
    """Return how many PubMed-indexed articles cite the given PMID.

    Uses ELink's pubmed_pubmed_citedin link; returns 0 when the article
    has no recorded citations.
    """
    resp = client.get(f"{BASE}/elink.fcgi", params=entrez_params(
        db="pubmed",
        dbfrom="pubmed",
        id=pmid,
        linkname="pubmed_pubmed_citedin",
        retmode="json",
    ))
    resp.raise_for_status()
    linkset_dbs = resp.json().get("linksets", [{}])[0].get("linksetdbs", [])
    if not linkset_dbs:
        return 0
    return len(linkset_dbs[0].get("links", []))
def get_related_articles(pmid: str, max_related: int = 10) -> list[str]:
    """Return up to `max_related` PMIDs of similar articles via ELink.

    The source PMID itself is filtered out of the result.
    """
    resp = client.get(f"{BASE}/elink.fcgi", params=entrez_params(
        db="pubmed",
        dbfrom="pubmed",
        id=pmid,
        linkname="pubmed_pubmed",
        retmode="json",
    ))
    resp.raise_for_status()
    linkset_dbs = resp.json().get("linksets", [{}])[0].get("linksetdbs", [])
    for entry in linkset_dbs:
        if entry.get("linkname") != "pubmed_pubmed":
            continue
        # Slice first, then drop the source PMID (matches original behavior).
        top = entry.get("links", [])[:max_related]
        return [str(link) for link in top if str(link) != pmid]
    return []
# Example: get citations for a set of papers
# (one ELink round trip per PMID, throttled below the rate limit).
for pmid in pmids[:10]:
    count = get_citation_count(pmid)
    print(f"PMID {pmid}: {count} citations")
    time.sleep(0.15)
Note that NCBI's citation data is less comprehensive than Google Scholar or Semantic Scholar — it only counts citations from other PubMed-indexed articles. For complete citation networks, query the Semantic Scholar API as a supplement.
Semantic Scholar Citation Enrichment
The Semantic Scholar API is free, no API key required for light use, and has broader citation data:
def get_semantic_scholar_citations(doi: str) -> dict:
    """Look up citation statistics for a DOI on Semantic Scholar.

    Returns the raw API payload (citationCount, influentialCitationCount,
    referenceCount, year) or {} when the DOI is empty or unknown.
    """
    if not doi:
        return {}
    endpoint = f"https://api.semanticscholar.org/graph/v1/paper/DOI:{doi}"
    fields = {"fields": "citationCount,influentialCitationCount,referenceCount,year"}
    resp = httpx.get(endpoint, params=fields, timeout=15)
    if resp.status_code == 404:
        # DOI not indexed by Semantic Scholar.
        return {}
    resp.raise_for_status()
    return resp.json()
# Enrich articles with Semantic Scholar citation counts
def enrich_with_citations(articles: list[dict]) -> list[dict]:
    """Attach Semantic Scholar citation counts to each article in place.

    Articles without a DOI are left untouched. Returns the same list.
    """
    for entry in articles:
        doi = entry.get("doi")
        if not doi:
            continue
        stats = get_semantic_scholar_citations(doi)
        entry["citation_count_ss"] = stats.get("citationCount", 0)
        entry["influential_citations"] = stats.get("influentialCitationCount", 0)
        time.sleep(0.5)  # Semantic Scholar rate limit
    return articles
Bulk Downloading for Research Pipelines
For large-scale literature mining, build a SQLite database:
def init_paper_db(db_path: str) -> sqlite3.Connection:
    """Open (creating if necessary) the papers database and return its connection.

    Sets up the `papers` table, year/journal indexes, and an external-content
    FTS5 table over title/abstract (kept fresh via an explicit 'rebuild' in
    search_local_corpus).
    """
    connection = sqlite3.connect(db_path)
    schema = """
    CREATE TABLE IF NOT EXISTS papers (
        pmid TEXT PRIMARY KEY,
        pmc_id TEXT,
        title TEXT,
        abstract TEXT,
        authors TEXT,
        affiliations TEXT,
        journal TEXT,
        issn TEXT,
        volume TEXT,
        issue TEXT,
        year TEXT,
        month TEXT,
        doi TEXT,
        mesh_terms TEXT,
        keywords TEXT,
        pub_types TEXT,
        citation_count INTEGER DEFAULT 0,
        full_text_url TEXT,
        fetched_at TEXT DEFAULT CURRENT_TIMESTAMP
    );
    CREATE INDEX IF NOT EXISTS idx_year ON papers(year);
    CREATE INDEX IF NOT EXISTS idx_journal ON papers(journal);
    CREATE VIRTUAL TABLE IF NOT EXISTS papers_fts USING fts5(
        pmid UNINDEXED,
        title,
        abstract,
        content=papers,
        content_rowid=rowid
    );
    """
    connection.executescript(schema)
    connection.commit()
    return connection
def build_paper_database(
    query: str,
    db_path: str,
    max_papers: int = 5000,
    proxy_url: str | None = None,
):
    """Search PubMed and persist article metadata into a local SQLite file.

    Already-stored PMIDs are skipped, so repeated runs are incremental.
    proxy_url: optional proxy for high-throughput pipelines (rebinds the
    module-level HTTP client).
    """
    global client
    if proxy_url:
        client = httpx.Client(timeout=30, proxy=proxy_url)

    db = init_paper_db(db_path)
    now = datetime.utcnow().isoformat()

    # Collect candidate PMIDs, then drop those already present in the DB.
    pmids = search_pubmed(query, max_results=max_papers)
    print(f"Found {len(pmids)} PMIDs to process")
    existing = {row[0] for row in db.execute("SELECT pmid FROM papers")}
    new_pmids = [p for p in pmids if p not in existing]
    print(f"New PMIDs to fetch: {len(new_pmids)} ({len(existing)} already in DB)")

    insert_sql = (
        "INSERT OR IGNORE INTO papers "
        "(pmid, pmc_id, title, abstract, authors, affiliations, journal, issn, "
        "volume, issue, year, month, doi, mesh_terms, keywords, pub_types, fetched_at) "
        "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
    )
    for art in fetch_articles(new_pmids):
        db.execute(insert_sql, (
            art["pmid"], art.get("pmc_id"), art["title"], art["abstract"],
            json.dumps(art["authors"]), json.dumps(art.get("affiliations", [])),
            art["journal"], art.get("issn"), art.get("volume"), art.get("issue"),
            art["year"], art.get("month"), art["doi"],
            json.dumps(art["mesh_terms"]), json.dumps(art.get("keywords", [])),
            json.dumps(art.get("pub_types", [])), now,
        ))
    db.commit()

    total = db.execute("SELECT COUNT(*) FROM papers").fetchone()[0]
    print(f"Database now contains {total} papers")
    db.close()
# Build AI drug discovery database
# (phrase queries are quoted; terms are AND-combined with a date-range tag).
build_paper_database(
    '"artificial intelligence" AND "drug discovery" AND 2024:2026[dp]',
    "ai_drug_discovery.db",
    max_papers=2000,
)
Full-Text Search with FTS5
Once you have a local database, FTS5 enables full-text search across your corpus:
def search_local_corpus(db_path: str, query: str, limit: int = 20) -> list[dict]:
    """Run an FTS5 full-text query over the local paper database.

    Returns up to `limit` rows ranked by relevance, each a dict with
    pmid/title/journal/year/doi plus a highlighted `snippet`.
    """
    db = sqlite3.connect(db_path)
    db.row_factory = sqlite3.Row
    # Refresh the external-content FTS index so newly inserted rows match.
    db.execute("INSERT INTO papers_fts(papers_fts) VALUES('rebuild')")
    rows = db.execute(
        """
        SELECT p.pmid, p.title, p.journal, p.year, p.doi,
               snippet(papers_fts, 1, '<b>', '</b>', '...', 20) as snippet
        FROM papers_fts
        JOIN papers p ON papers_fts.pmid = p.pmid
        WHERE papers_fts MATCH ?
        ORDER BY rank
        LIMIT ?
        """,
        (query, limit),
    ).fetchall()
    db.close()
    return [dict(row) for row in rows]
# Find papers mentioning a specific protein
# (FTS5 treats multiple words as an implicit AND across title/abstract).
hits = search_local_corpus("ai_drug_discovery.db", "transformer attention protein folding")
for hit in hits:
    print(f"[{hit['year']}] {hit['title'][:80]}...")
    print(f" {hit['journal']} | PMID: {hit['pmid']}")
    print(f" ...{hit['snippet']}...")
    print()
Analyzing Research Trends
With a populated database, you can analyze publication trends over time:
def analyze_trends(db_path: str, keyword: str) -> dict:
    """Count papers per year whose title or abstract contains `keyword`.

    LIKE matching is case-insensitive for ASCII text. Rows with an empty
    year or a year before 2015 are excluded. Returns {year: count}.
    """
    pattern = f"%{keyword}%"
    db = sqlite3.connect(db_path)
    rows = db.execute(
        """
        SELECT year, COUNT(*) as count
        FROM papers
        WHERE (title LIKE ? OR abstract LIKE ?)
        AND year != ''
        AND CAST(year AS INTEGER) >= 2015
        GROUP BY year
        ORDER BY year
        """,
        (pattern, pattern),
    ).fetchall()
    db.close()
    return {str(year): count for year, count in rows}
def get_top_journals(db_path: str, limit: int = 20) -> list[tuple]:
    """Return (journal, paper_count) tuples, most-published journals first.

    Rows with an empty journal name are ignored.
    """
    db = sqlite3.connect(db_path)
    ranking = db.execute(
        """
        SELECT journal, COUNT(*) as count
        FROM papers
        WHERE journal != ''
        GROUP BY journal
        ORDER BY count DESC
        LIMIT ?
        """,
        (limit,),
    ).fetchall()
    db.close()
    return ranking
def get_author_network(db_path: str, min_papers: int = 3) -> dict:
    """Map author name -> paper count for authors with >= min_papers papers.

    Result is ordered by count, descending. Rows whose `authors` column is
    not valid JSON are skipped silently.
    """
    db = sqlite3.connect(db_path)
    rows = db.execute(
        "SELECT pmid, authors FROM papers WHERE authors != '[]'"
    ).fetchall()
    db.close()

    counts: dict[str, int] = {}
    for _pmid, raw in rows:
        try:
            names = json.loads(raw)
        except json.JSONDecodeError:
            continue
        for name in names:
            counts[name] = counts.get(name, 0) + 1

    ranked = sorted(counts.items(), key=lambda item: -item[1])
    return {name: n for name, n in ranked if n >= min_papers}
Rate Limits and Best Practices
NCBI is remarkably generous compared to commercial APIs, but they enforce limits:
- Without API key: 3 requests/second, risk of temporary IP bans
- With API key: 10 requests/second
- Bulk downloads: For datasets exceeding 10,000 articles, use NCBI's FTP bulk download service instead (ftp.ncbi.nlm.nih.gov/pubmed/baseline/)
For running large-scale pipelines that hit the API continuously, routing through a proxy distributes the request load. ThorData residential proxies work well for this — they let you distribute requests across IPs so a temporary rate limit on one IP doesn't stall your entire pipeline:
# For high-throughput pipelines: route Entrez traffic through a proxy.
proxy_client = httpx.Client(
    timeout=30,
    proxy="http://USER:[email protected]:9000",
)
# Replace the global client for proxy usage
# (all helper functions above read the module-level `client`).
client = proxy_client
Always set tool and email parameters in your requests — NCBI tracks tool usage and this reduces throttling risk:
# entrez_params() already supplies tool/email defaults; keyword arguments
# passed here override them for this request.
params = entrez_params(
    db="pubmed",
    term=query,
    tool="my_research_pipeline",
    email="[email protected]",
)
Open Access Full Text
For papers with open-access full text, you can get the content through PMC:
def get_full_text_url(pmid: str) -> Optional[str]:
    """Return the PMC full-text article URL for a PMID, or None.

    Uses ELink to map the PubMed id into the PMC database; only records
    with a linked PMC entry have open-access full text available.
    """
    resp = client.get(f"{BASE}/elink.fcgi", params=entrez_params(
        db="pmc",
        dbfrom="pubmed",
        id=pmid,
        retmode="json",
    ))
    # Fix: consistent with every other Entrez helper in this file — fail
    # loudly on HTTP errors instead of surfacing a confusing JSON decode
    # error from an error page.
    resp.raise_for_status()
    data = resp.json()
    linksets = data.get("linksets", [{}])
    links = linksets[0].get("linksetdbs", [])
    for link_db in links:
        if link_db.get("dbto") == "pmc":
            pmc_ids = link_db.get("links", [])
            if pmc_ids:
                return f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{pmc_ids[0]}/"
    return None
def download_pmc_fulltext(pmc_url: str) -> str:
    """Fetch a PMC article page and extract its body text.

    Figures, tables, and reference lists are stripped before extraction.
    Returns "" when no recognizable article container is found.
    """
    from selectolax.parser import HTMLParser

    resp = httpx.get(pmc_url, timeout=30, follow_redirects=True)
    resp.raise_for_status()
    doc = HTMLParser(resp.text)
    # NOTE(review): "#mc-article-content" looks like it might be a typo for
    # "#main-content" — confirm against live PMC article markup.
    body = doc.css_first("#mc-article-content, .article-content, #article-content")
    if not body:
        return ""
    # Drop non-prose containers so the extracted text reads cleanly.
    for node in body.css("figure, .fig, .table-wrap, #references, .ref-list"):
        node.decompose()
    return body.text(separator="\n", strip=True)
FTP Bulk Download for Massive Datasets
For datasets exceeding tens of thousands of papers, the NCBI FTP service is more appropriate than the API:
import ftplib
import gzip
import os
def download_pubmed_baseline(output_dir: str, max_files: int = 5):
    """
    Download PubMed annual baseline files from NCBI FTP.
    Each file contains ~30,000 articles in XML format.
    Full baseline has ~1,200 files = ~36 million articles.

    Already-downloaded files in `output_dir` are skipped, so interrupted
    runs can be resumed. Returns `output_dir`.
    """
    os.makedirs(output_dir, exist_ok=True)
    ftp = ftplib.FTP("ftp.ncbi.nlm.nih.gov")
    ftp.login()  # anonymous login is sufficient for the public baseline
    ftp.cwd("/pubmed/baseline/")
    files = [f for f in ftp.nlst() if f.endswith(".xml.gz")]
    print(f"Found {len(files)} baseline files")
    try:
        for i, filename in enumerate(files[:max_files]):
            local_path = os.path.join(output_dir, filename)
            if os.path.exists(local_path):
                # Bug fix: messages and the RETR command previously used a
                # literal "(unknown)" placeholder instead of the filename,
                # so every transfer failed.
                print(f"  {filename} already downloaded, skipping")
                continue
            print(f"  Downloading {filename} ({i+1}/{min(max_files, len(files))})")
            with open(local_path, "wb") as f:
                ftp.retrbinary(f"RETR {filename}", f.write)
    finally:
        # Always close the control connection, even if a transfer fails.
        ftp.quit()
    return output_dir
def parse_baseline_file(gz_path: str) -> list[dict]:
    """Parse one gzipped PubMed baseline XML file into article dicts.

    Malformed PubmedArticle entries are skipped rather than aborting the
    whole file.
    """
    with gzip.open(gz_path, "rb") as fh:
        root = ET.fromstring(fh.read())

    results: list[dict] = []
    for node in root.findall(".//PubmedArticle"):
        try:
            results.append(parse_article(node))
        except Exception:
            continue  # Skip malformed entries
    return results
PubMed is one of the best-behaved scraping targets you'll encounter. The API is comprehensive, the documentation is excellent, and NCBI actively supports programmatic access. Use the tools they provide — the Entrez API, FTP bulk downloads, and the PMC full-text service — and you'll have a robust research pipeline that handles millions of articles without any anti-bot friction.