How to Scrape npm Package Data with Python (2026 Guide)
The npm registry is a CouchDB instance behind a CDN. Every package published to npm has a JSON document you can fetch directly — no authentication, no API key, no scraping needed for basic metadata. It is one of the most developer-friendly data sources on the web.
The registry URL is https://registry.npmjs.org. Hit it with a package name and you get everything: versions, maintainers, dependencies, publish dates, readme content, and more.
This guide covers everything from basic package lookups to building full npm ecosystem datasets at scale, including proxy rotation strategies for high-volume collection.
Environment Setup
pip install requests httpx aiohttp aiofiles tqdm
For async collection of large datasets:
pip install aiohttp
Fetching Package Metadata
The simplest possible fetch:
import requests
def get_package_info(package_name):
    """
    Return the full registry document for *package_name*.

    The npm registry serves complete package metadata — versions,
    maintainers, dependencies, publish dates — as public JSON with no
    authentication. Raises requests.HTTPError on a non-2xx response.
    """
    registry_url = f"https://registry.npmjs.org/{package_name}"
    response = requests.get(registry_url, timeout=15)
    response.raise_for_status()
    return response.json()
# Example: pull the full registry document for Express and print the
# headline fields (dist-tags, version count, license, links).
data = get_package_info("express")
print(f"Name: {data['name']}")
print(f"Description: {data.get('description', 'N/A')}")
print(f"Latest version: {data['dist-tags']['latest']}")
print(f"Total versions: {len(data['versions'])}")
print(f"License: {data.get('license', 'N/A')}")
print(f"Homepage: {data.get('homepage', 'N/A')}")
print(f"Repository: {data.get('repository', {}).get('url', 'N/A')}")
The full document for popular packages can be massive — Express is over 2MB, React is over 15MB. If you only need the latest version, use the abbreviated metadata endpoint:
def get_package_abbreviated(package_name):
    """
    Fetch abbreviated package metadata (install-v1 format).

    The Accept header asks the registry for the slimmed-down document
    npm itself uses for installs — versions, dependencies, dist info —
    which is far smaller than the full CouchDB record.
    """
    url = f"https://registry.npmjs.org/{package_name}"
    accept_abbreviated = {"Accept": "application/vnd.npm.install-v1+json"}
    response = requests.get(url, headers=accept_abbreviated, timeout=15)
    response.raise_for_status()
    return response.json()
def get_latest_version(package_name):
    """Fetch only the latest-version manifest — the smallest possible response."""
    url = f"https://registry.npmjs.org/{package_name}/latest"
    response = requests.get(url, timeout=15)
    response.raise_for_status()
    return response.json()
Version History and Release Timeline
Every published version is in the versions object. The time field has ISO 8601 timestamps for every version publish:
import requests
from datetime import datetime
def get_version_timeline(package_name):
    """
    Extract the complete version history with publish dates for a package.

    Useful for tracking release cadence and identifying active vs
    abandoned packages.

    Returns a dict with created/modified timestamps, version counts, and
    a publish-date-sorted list of {version, published_at, is_prerelease}.
    Raises requests.HTTPError for unknown packages.
    """
    resp = requests.get(f"https://registry.npmjs.org/{package_name}", timeout=20)
    # Fix: the original decoded JSON without checking the HTTP status,
    # so a 404 (unknown package) surfaced later as a confusing KeyError.
    resp.raise_for_status()
    data = resp.json()
    time_data = data.get("time", {})
    versions = [
        {
            "version": version_str,
            "published_at": publish_date,
            # semver marks prereleases with a hyphen (e.g. "1.0.0-beta.1")
            "is_prerelease": "-" in version_str,
        }
        for version_str, publish_date in time_data.items()
        # "created"/"modified" are document-level timestamps, not versions
        if version_str not in ("created", "modified")
    ]
    # Sort by publish date (ISO 8601 strings sort chronologically)
    versions.sort(key=lambda x: x["published_at"])
    return {
        "package": package_name,
        "created": time_data.get("created"),
        "last_modified": time_data.get("modified"),
        "total_versions": len(versions),
        "stable_versions": sum(1 for v in versions if not v["is_prerelease"]),
        "versions": versions,
    }
# Example: summarize React's release history and show the newest entries.
timeline = get_version_timeline("react")
print(f"React first published: {timeline['created']}")
print(f"Total releases: {timeline['total_versions']}")
print()
print("Last 5 releases:")
# Entries are sorted by publish date, so the tail holds the newest ones.
for v in timeline["versions"][-5:]:
    print(f" {v['version']} — {v['published_at']}")
Analyzing Release Velocity
def calculate_release_velocity(package_name, window_days=365):
    """
    Compute stable releases per month over the last *window_days* days.

    Prereleases are excluded from the count. The returned "health" label
    is a coarse signal for maintenance activity.
    """
    from datetime import datetime, timezone, timedelta

    timeline = get_version_timeline(package_name)
    cutoff = datetime.now(timezone.utc) - timedelta(days=window_days)

    def _published_at(entry):
        # Normalize the trailing "Z" so fromisoformat accepts the stamp.
        return datetime.fromisoformat(entry["published_at"].replace("Z", "+00:00"))

    recent_releases = []
    for entry in timeline["versions"]:
        if entry["is_prerelease"]:
            continue
        if _published_at(entry) > cutoff:
            recent_releases.append(entry)

    releases_per_month = len(recent_releases) / (window_days / 30)
    if releases_per_month > 0.5:
        health = "active"
    elif releases_per_month > 0:
        health = "low-activity"
    else:
        health = "abandoned"

    all_versions = timeline["versions"]
    return {
        "package": package_name,
        "releases_last_year": len(recent_releases),
        "releases_per_month": round(releases_per_month, 2),
        "last_release": all_versions[-1]["published_at"] if all_versions else None,
        "health": health,
    }
# Compare release cadence across the major front-end frameworks.
for pkg in ["react", "vue", "angular", "svelte"]:
    v = calculate_release_velocity(pkg)
    print(f"{pkg}: {v['releases_per_month']:.1f} releases/month ({v['health']})")
Download Count APIs
npm download stats live on a separate API at api.npmjs.org:
import requests
def get_download_stats(package_name, period="last-month"):
    """
    Fetch the total download count for one package over *period*.

    period options: 'last-day', 'last-week', 'last-month', 'last-year',
    or a date range like '2026-01-01:2026-03-31'.

    Raises requests.HTTPError on a non-2xx response instead of failing
    later on a malformed/error body.
    """
    resp = requests.get(
        f"https://api.npmjs.org/downloads/point/{period}/{package_name}",
        timeout=10,
    )
    # Fix: the original decoded JSON without checking the HTTP status.
    resp.raise_for_status()
    data = resp.json()
    return {
        "package": data.get("package"),
        "downloads": data.get("downloads", 0),
        "period": period,
        "start": data.get("start"),
        "end": data.get("end"),
    }
def get_daily_downloads(package_name, period="last-month"):
    """
    Fetch the day-by-day download breakdown for one package.

    Returns the raw per-day entries plus their sum. Raises
    requests.HTTPError on a non-2xx response.
    """
    resp = requests.get(
        f"https://api.npmjs.org/downloads/range/{period}/{package_name}",
        timeout=10,
    )
    # Fix: the original decoded JSON without checking the HTTP status.
    resp.raise_for_status()
    data = resp.json()
    return {
        "package": data.get("package"),
        "total": sum(d["downloads"] for d in data.get("downloads", [])),
        "daily": data.get("downloads", []),
    }
# Compare frameworks
# Named period: one of the API's predefined windows.
stats = get_download_stats("react", "last-month")
print(f"React last month: {stats['downloads']:,} downloads")
# Custom date range
stats = get_download_stats("next", "2026-01-01:2026-03-31")
print(f"Next.js Q1 2026: {stats['downloads']:,} downloads")
Bulk Download Counts (up to 128 packages at once)
import requests
import time
def get_bulk_downloads(package_names, period="last-month"):
    """
    Fetch download counts for multiple packages.

    The npm bulk endpoint accepts up to 128 *unscoped* packages per
    request. Scoped packages (@scope/name) are not supported by the bulk
    endpoint, so they are fetched one at a time via the single-package
    endpoint.

    Returns {package_name: downloads}, with 0 when stats are unavailable.
    """
    all_stats = {}
    # Fix: the bulk endpoint rejects scoped names — split them out.
    unscoped = [n for n in package_names if not n.startswith("@")]
    scoped = [n for n in package_names if n.startswith("@")]

    chunk_size = 128
    for i in range(0, len(unscoped), chunk_size):
        chunk = unscoped[i:i + chunk_size]
        package_list = ",".join(chunk)
        resp = requests.get(
            f"https://api.npmjs.org/downloads/point/{period}/{package_list}",
            timeout=15,
        )
        if resp.status_code == 200:
            data = resp.json()
            if len(chunk) == 1:
                # Fix: a single-package request returns one stats object,
                # not a {name: stats} mapping — normalize the shape.
                data = {chunk[0]: data}
            for pkg_name, stats in data.items():
                all_stats[pkg_name] = stats.get("downloads", 0) if stats else 0
        if i + chunk_size < len(unscoped):
            time.sleep(0.5)  # Polite delay between chunks

    # Scoped packages: one request each.
    for name in scoped:
        resp = requests.get(
            f"https://api.npmjs.org/downloads/point/{period}/{name}",
            timeout=15,
        )
        all_stats[name] = resp.json().get("downloads", 0) if resp.status_code == 200 else 0
        time.sleep(0.5)
    return all_stats
# Compare download counts across major frameworks
# NOTE(review): "@angular/core" is a scoped package, but the bulk
# download endpoint only accepts unscoped names — confirm that
# get_bulk_downloads handles scoped packages, or its count may be
# missing from the results. TODO confirm
packages = [
    "react", "vue", "angular", "@angular/core", "svelte",
    "next", "nuxt", "gatsby", "remix", "astro",
    "express", "fastify", "hapi", "koa", "nestjs",
]
stats = get_bulk_downloads(packages)
print("npm download rankings (last month):")
for pkg, count in sorted(stats.items(), key=lambda x: x[1], reverse=True):
    # One '=' per 10M monthly downloads — a rough ASCII bar chart.
    bar = "=" * (count // 10_000_000)
    print(f" {pkg:<20} {count:>15,} {bar}")
Dependency Analysis
Each version object includes dependencies, devDependencies, peerDependencies, and optionalDependencies:
import requests
from collections import Counter
def analyze_dependencies(package_name, version="latest"):
    """
    Extract and categorize all dependencies for a specific package version.

    Returns the four dependency maps (runtime/dev/peer/optional) plus
    engines and a runtime-dependency count. Raises requests.HTTPError
    for unknown packages or versions.
    """
    resp = requests.get(
        f"https://registry.npmjs.org/{package_name}/{version}",
        timeout=15,
    )
    # Fix: surface 404s instead of parsing an error body into the result.
    resp.raise_for_status()
    data = resp.json()
    runtime_deps = data.get("dependencies", {})
    return {
        "package": package_name,
        "version": data.get("version"),
        "dependencies": runtime_deps,
        "devDependencies": data.get("devDependencies", {}),
        "peerDependencies": data.get("peerDependencies", {}),
        "optionalDependencies": data.get("optionalDependencies", {}),
        "engines": data.get("engines", {}),
        "total_runtime_deps": len(runtime_deps),
    }
# Example: list Next.js's runtime dependencies with their version ranges.
deps = analyze_dependencies("next")
print(f"next@{deps['version']} runtime dependencies: {deps['total_runtime_deps']}")
for dep, version_range in sorted(deps["dependencies"].items()):
    print(f" {dep}: {version_range}")
Finding the Most Common Dependencies Across the Ecosystem
import requests
import time
from collections import Counter
def find_most_depended_on(package_list):
    """
    Tally how often each dependency appears across *package_list*.

    Fetches each package's latest manifest and counts its runtime
    dependencies; returns the 20 most common as (name, count) pairs.
    Errors on individual packages are reported and skipped.
    """
    tally = Counter()
    for pkg_name in package_list:
        try:
            manifest = requests.get(
                f"https://registry.npmjs.org/{pkg_name}/latest",
                timeout=10,
            ).json()
            # Count each dependency name once per package.
            tally.update(manifest.get("dependencies", {}).keys())
            time.sleep(0.3)  # polite pacing between lookups
        except Exception as e:
            print(f"Error fetching {pkg_name}: {e}")
    return tally.most_common(20)
# Example: check what the top CLI tools all depend on
cli_packages = ["eslint", "prettier", "typescript", "webpack", "vite", "rollup", "esbuild"]
common = find_most_depended_on(cli_packages)
print("Most common dependencies:")
# Pairs come back ordered most-common first.
for dep, count in common:
    print(f" {dep}: depended on by {count} packages")
Package Search API
import requests
def search_packages(query, size=20, page=0, quality=0.65, popularity=0.98, maintenance=0.5):
    """
    Search the npm registry with weighted ranking criteria.

    quality: test coverage, linting, docs quality (0-1)
    popularity: download count influence (0-1)
    maintenance: recency, issue response rate (0-1)

    Returns a list of flattened result dicts with name, version,
    description, and the registry's score breakdown. Raises
    requests.HTTPError on a non-2xx response.
    """
    resp = requests.get(
        "https://registry.npmjs.org/-/v1/search",
        params={
            "text": query,
            "size": size,
            "from": page * size,  # pagination offset
            "quality": quality,
            "popularity": popularity,
            "maintenance": maintenance,
        },
        timeout=15,
    )
    # Fix: the original decoded JSON without checking the HTTP status.
    resp.raise_for_status()
    data = resp.json()
    packages = []
    for obj in data.get("objects", []):
        pkg = obj["package"]
        packages.append({
            "name": pkg["name"],
            "version": pkg["version"],
            "description": pkg.get("description", ""),
            "keywords": pkg.get("keywords", []),
            "date": pkg.get("date"),
            # Fix: "author"/"publisher" can be present-but-null in search
            # results; `or {}` guards against AttributeError on None.
            "author": (pkg.get("author") or {}).get("name"),
            "publisher": (pkg.get("publisher") or {}).get("username"),
            "score_final": round(obj["score"]["final"], 3),
            "score_quality": round(obj["score"]["detail"]["quality"], 3),
            "score_popularity": round(obj["score"]["detail"]["popularity"], 3),
            "score_maintenance": round(obj["score"]["detail"]["maintenance"], 3),
            "search_score": obj.get("searchScore"),
        })
    return packages
# Find web scraping packages on npm
scrapers = search_packages("web scraping", size=20, popularity=0.9)
print("Top npm scraping packages:")
for pkg in scrapers:
    print(f" {pkg['name']} v{pkg['version']} (score: {pkg['score_final']})")
    # Trim long descriptions to keep the listing compact.
    print(f" {pkg['description'][:80]}")
Async Collection for Large Datasets
For collecting data on thousands of packages, synchronous requests are too slow. Use async:
import asyncio
import aiohttp
import json
import time
from pathlib import Path
async def fetch_package_async(session, package_name, semaphore):
    """
    Fetch one package's abbreviated metadata, bounded by *semaphore*.

    Returns the parsed JSON document, or None on a non-200 status or
    any request error (errors are printed, not raised).
    """
    url = f"https://registry.npmjs.org/{package_name}"
    abbreviated = {"Accept": "application/vnd.npm.install-v1+json"}
    async with semaphore:
        try:
            async with session.get(
                url,
                headers=abbreviated,
                timeout=aiohttp.ClientTimeout(total=15),
            ) as resp:
                if resp.status != 200:
                    return None
                return await resp.json()
        except Exception as exc:
            print(f"Error fetching {package_name}: {exc}")
            return None
async def collect_packages_batch(package_names, concurrency=10, proxy_url=None):
    """
    Collect abbreviated metadata for many packages concurrently.

    A semaphore caps in-flight requests to avoid overwhelming the API.
    *proxy_url* (e.g. "http://user:pass@host:9000") is applied to every
    request — the original accepted this parameter but never used it.

    Returns the list of successfully fetched JSON documents.
    """
    connector = aiohttp.TCPConnector(limit=concurrency)
    semaphore = asyncio.Semaphore(concurrency)
    timeout = aiohttp.ClientTimeout(total=30)
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; npm-research-bot/1.0)",
    }

    async def _fetch(session, package_name):
        # Self-contained fetch so proxy_url is actually honored: aiohttp
        # takes the proxy per request, not on the session.
        async with semaphore:
            try:
                async with session.get(
                    f"https://registry.npmjs.org/{package_name}",
                    headers={"Accept": "application/vnd.npm.install-v1+json"},
                    timeout=aiohttp.ClientTimeout(total=15),
                    proxy=proxy_url,
                ) as resp:
                    if resp.status == 200:
                        return await resp.json()
                    return None
            except Exception as e:
                print(f"Error fetching {package_name}: {e}")
                return None

    async with aiohttp.ClientSession(
        connector=connector,
        timeout=timeout,
        headers=headers,
    ) as session:
        tasks = [_fetch(session, name) for name in package_names]
        results = await asyncio.gather(*tasks, return_exceptions=True)
    return [r for r in results if r and not isinstance(r, Exception)]
# Usage: collect top 1000 packages
def get_top_packages_by_search():
    """
    Collect popular package names by searching several broad terms.

    Deduplicates across terms and preserves first-seen order.
    """
    import requests

    terms = ["utility", "http", "database", "testing", "cli", "parser", "formatter", "validation"]
    seen = set()
    names = []
    for term in terms:
        resp = requests.get(
            "https://registry.npmjs.org/-/v1/search",
            params={"text": term, "size": 250, "popularity": 1.0},
            timeout=15,  # fix: the original request had no timeout and could hang
        )
        resp.raise_for_status()  # fix: fail loudly instead of parsing an error body
        for obj in resp.json().get("objects", []):
            name = obj["package"]["name"]
            if name not in seen:
                seen.add(name)
                names.append(name)
    return names
async def main():
    """Drive the full collection: discover names, then fetch in batches."""
    package_names = get_top_packages_by_search()
    print(f"Collecting data for {len(package_names)} packages...")
    collected = []
    step = 100
    for start in range(0, len(package_names), step):
        chunk = package_names[start:start + step]
        collected.extend(await collect_packages_batch(chunk, concurrency=10))
        print(f"Collected {len(collected)}/{len(package_names)}")
        await asyncio.sleep(1)  # Brief pause between batches
    return collected
# Kick off the async collection and report the final tally.
results = asyncio.run(main())
print(f"Total packages collected: {len(results)}")
Rate Limits, Proxy Rotation, and Scaling
The npm registry is generous with rate limits for individual package lookups. The download stats API (api.npmjs.org) is more restrictive — 429 responses appear when you hammer it.
For large-scale collection (pulling metadata for the top 10,000 packages with download trends), you need:
- Request pacing — stay under 5 requests/second per IP to the registry
- Proxy rotation — distribute requests across multiple IPs for very large jobs
- Caching — store results locally to avoid re-fetching unchanged data
Backoff-Aware Fetcher
import requests
import time
import random
def fetch_with_backoff(url, headers=None, proxy=None, max_retries=5):
    """
    GET *url* as JSON with exponential backoff on rate limiting.

    Retries on HTTP 429 and on connection/timeout errors; returns None
    on 404 (package not found); raises requests.HTTPError for other
    non-2xx statuses; raises Exception after *max_retries* attempts.
    """
    session = requests.Session()
    if headers:
        session.headers.update(headers)
    if proxy:
        session.proxies = {"http": proxy, "https": proxy}
    try:
        for attempt in range(max_retries):
            try:
                resp = session.get(url, timeout=15)
            except (requests.exceptions.ConnectionError,
                    requests.exceptions.Timeout) as e:
                # Fix: also retry timeouts (original only caught ConnectionError).
                wait = 2 ** attempt
                print(f"Connection error: {e}. Retrying in {wait}s...")
                time.sleep(wait)
                continue
            if resp.status_code == 200:
                return resp.json()
            if resp.status_code == 429:
                # Exponential backoff with jitter to desynchronize clients.
                wait = (2 ** attempt) + random.uniform(0, 1)
                print(f"Rate limited. Waiting {wait:.1f}s (attempt {attempt + 1}/{max_retries})")
                time.sleep(wait)
            elif resp.status_code == 404:
                return None  # Package not found
            else:
                resp.raise_for_status()
        raise Exception(f"Failed after {max_retries} retries: {url}")
    finally:
        # Fix: the original leaked the Session's connection pool.
        session.close()
Proxy Integration with ThorData
When pulling data for thousands of packages — download trends, dependency trees, version histories — requests pile up fast. Rotating through residential proxies with ThorData keeps individual IP request counts low and avoids Cloudflare challenges that npm's CDN layer can occasionally throw at suspicious traffic patterns.
import requests
THORDATA_USER = "your_username"
THORDATA_PASS = "your_password"
def get_thordata_proxy(session_id=None):
"""
Get a ThorData residential proxy.
Pass session_id to reuse the same IP across multiple requests (sticky session).
Omit session_id for per-request IP rotation.
"""
if session_id:
user = f"{THORDATA_USER}-session-{session_id}"
else:
user = THORDATA_USER
proxy_url = f"http://{user}:{THORDATA_PASS}@proxy.thordata.com:9000"
return {"http": proxy_url, "https": proxy_url}
# Per-request rotation: different IP for each package
def fetch_npm_package_proxied(package_name):
    """Fetch full package metadata through a freshly rotated proxy IP."""
    registry_url = f"https://registry.npmjs.org/{package_name}"
    response = requests.get(
        registry_url,
        proxies=get_thordata_proxy(),  # fresh IP each call
        timeout=15,
    )
    return response.json()
# Sticky session: same IP for a batch of related requests
def fetch_package_batch_sticky(package_names, session_id="batch_001"):
    """
    Fetch several packages through a single sticky-session proxy IP.

    Reusing one Session (and one exit IP) makes a batch of related
    requests look like a single well-behaved client. Errors on
    individual packages are printed and skipped.

    Returns {package_name: json_document}.
    """
    results = {}
    # Fix: use the Session as a context manager so the connection pool
    # is always released (the original never closed it).
    with requests.Session() as session:
        session.proxies = get_thordata_proxy(session_id=session_id)
        for name in package_names:
            try:
                resp = session.get(f"https://registry.npmjs.org/{name}", timeout=15)
                results[name] = resp.json()
                time.sleep(0.3)  # polite pacing between lookups
            except Exception as e:
                print(f"Error: {name} — {e}")
    return results
Building a Full Package Dataset
Here is a complete script that collects the top packages by download count, enriches them with metadata, and saves a structured dataset:
import requests
import json
import time
import csv
from pathlib import Path
from datetime import datetime, timezone
def collect_npm_dataset(
    search_terms=None,
    per_term=50,
    output_file="npm_packages.json",
    proxy=None,
):
    """
    Build a complete npm package dataset with metadata and download counts.

    Three phases: (1) discover package names via the search API,
    (2) attach last-month download counts via the bulk downloads API,
    (3) enrich the first 100 packages with license/repository/dependency
    details. Results are saved as JSON and CSV, sorted by downloads.

    Parameters:
        search_terms: search queries seeding discovery (defaults to a
            broad built-in list).
        per_term: results requested per search term.
        output_file: JSON output path; the CSV is written next to it.
        proxy: optional proxy URL applied to every request.

    Returns the sorted list of package dicts.
    """
    if search_terms is None:
        search_terms = [
            "web framework", "database", "testing", "cli",
            "http client", "authentication", "validation",
            "parsing", "bundler", "state management",
        ]
    session = requests.Session()
    if proxy:
        session.proxies = {"http": proxy, "https": proxy}

    packages = _discover_packages(session, search_terms, per_term)
    _attach_download_counts(session, packages)
    _attach_details(session, packages, limit=100)

    results = sorted(
        packages.values(),
        key=lambda x: x.get("monthly_downloads", 0),
        reverse=True,
    )
    _save_dataset(results, output_file)
    return results


def _discover_packages(session, search_terms, per_term):
    """Phase 1: collect unique package summaries via the search API."""
    print("Phase 1: Discovering packages via search...")
    packages = {}
    for term in search_terms:
        resp = session.get(
            "https://registry.npmjs.org/-/v1/search",
            params={"text": term, "size": per_term, "popularity": 1.0},
            timeout=15,
        )
        resp.raise_for_status()  # fix: original parsed JSON without a status check
        for obj in resp.json().get("objects", []):
            pkg = obj["package"]
            name = pkg["name"]
            if name not in packages:
                packages[name] = {
                    "name": name,
                    "version": pkg["version"],
                    "description": pkg.get("description", ""),
                    "keywords": pkg.get("keywords", []),
                    "date": pkg.get("date"),
                    "score": obj["score"]["final"],
                }
        print(f" Found {len(packages)} unique packages after searching '{term}'")
        time.sleep(0.5)
    return packages


def _attach_download_counts(session, packages):
    """Phase 2: add last-month download counts using the bulk API."""
    print(f"\nPhase 2: Fetching download counts for {len(packages)} packages...")
    # Fix: the bulk endpoint supports only unscoped names; scoped
    # packages (@scope/name) must be queried one at a time.
    unscoped = [n for n in packages if not n.startswith("@")]
    scoped = [n for n in packages if n.startswith("@")]
    done = 0
    for i in range(0, len(unscoped), 128):
        chunk = unscoped[i:i + 128]
        resp = session.get(
            f"https://api.npmjs.org/downloads/point/last-month/{','.join(chunk)}",
            timeout=15,
        )
        if resp.status_code == 200:
            data = resp.json()
            if len(chunk) == 1:
                # Fix: a single-package request returns one stats object,
                # not a {name: stats} mapping.
                data = {chunk[0]: data}
            for pkg_name, stats in data.items():
                if pkg_name in packages and stats:
                    packages[pkg_name]["monthly_downloads"] = stats.get("downloads", 0)
        done += len(chunk)
        print(f" Processed {done}/{len(packages)}")
        time.sleep(0.5)
    for name in scoped:
        resp = session.get(
            f"https://api.npmjs.org/downloads/point/last-month/{name}",
            timeout=15,
        )
        if resp.status_code == 200:
            packages[name]["monthly_downloads"] = resp.json().get("downloads", 0)
        done += 1
        print(f" Processed {done}/{len(packages)}")
        time.sleep(0.5)


def _attach_details(session, packages, limit=100):
    """Phase 3: enrich the first *limit* packages with registry details."""
    print(f"\nPhase 3: Fetching detailed metadata...")
    for i, name in enumerate(list(packages)[:limit]):
        try:
            resp = session.get(
                f"https://registry.npmjs.org/{name}/latest",
                timeout=10,
            )
            if resp.status_code == 200:
                data = resp.json()
                repo = data.get("repository")
                packages[name].update({
                    "license": data.get("license"),
                    "homepage": data.get("homepage"),
                    # "repository" may be a dict or a bare string; only
                    # the dict form carries a "url" key.
                    "repository": repo.get("url") if isinstance(repo, dict) else None,
                    "runtime_deps": len(data.get("dependencies", {})),
                    "engines": data.get("engines", {}),
                    "main_file": data.get("main"),
                })
        except Exception as e:
            print(f" Error fetching details for {name}: {e}")
        if (i + 1) % 10 == 0:
            print(f" Enriched {i + 1} packages")
        time.sleep(1)  # pace per-package lookups


def _save_dataset(results, output_file):
    """Persist results as indented JSON plus a flat CSV summary."""
    Path(output_file).write_text(json.dumps(results, indent=2))
    print(f"\nDataset saved: {len(results)} packages → {output_file}")
    # Fix: Path.with_suffix works for any input extension, unlike the
    # original str.replace(".json", ".csv"), which silently produced the
    # same filename when the output didn't end in ".json".
    csv_file = Path(output_file).with_suffix(".csv")
    if results:
        with open(csv_file, "w", newline="") as f:
            writer = csv.DictWriter(f, fieldnames=[
                "name", "version", "description", "monthly_downloads",
                "score", "license", "runtime_deps", "homepage",
            ])
            writer.writeheader()
            for pkg in results:
                writer.writerow({
                    "name": pkg.get("name", ""),
                    "version": pkg.get("version", ""),
                    # Trim long descriptions to keep the CSV readable.
                    "description": pkg.get("description", "")[:100],
                    "monthly_downloads": pkg.get("monthly_downloads", 0),
                    "score": pkg.get("score", 0),
                    "license": pkg.get("license", ""),
                    "runtime_deps": pkg.get("runtime_deps", ""),
                    "homepage": pkg.get("homepage", ""),
                })
        print(f"CSV saved: {csv_file}")
# Run the collection
dataset = collect_npm_dataset(
    search_terms=["web framework", "http", "testing", "cli", "database"],
    per_term=50,
    output_file="npm_packages.json",
)
# Results come back sorted by monthly downloads, descending.
print(f"\nTop 10 by monthly downloads:")
for pkg in dataset[:10]:
    print(f" {pkg['name']}: {pkg.get('monthly_downloads', 0):,} downloads/month")
What the Registry Does Not Expose
Some data points are not available in the npm registry itself:
- GitHub stars — not stored in the registry. Use the GitHub API (`https://api.github.com/repos/owner/repo`) by parsing the repository URL
- Bundle size — use the Bundlephobia API: `https://bundlephobia.com/api/size?package=react@18.2.0` for minified and gzipped sizes
- Security vulnerabilities — npm audit data is not publicly accessible via API. Use the OSV.dev vulnerability API (`https://api.osv.dev/v1/query`), which aggregates the GitHub Advisory Database, instead
- Actual usage in projects — download counts include CI/CD pipelines, mirrors, and bots. They are directional signals, not precise real-world usage metrics
- Reverse dependencies — which packages depend on a given package. Use the npm website search or parse the bulk `_all_docs` endpoint
Getting Bundle Size via Bundlephobia
import requests
def get_bundle_size(package_name, version=None):
    """
    Look up minified and gzipped bundle size via the Bundlephobia API.

    The npm registry does not expose bundle size, so this fills the gap.
    Returns None when Bundlephobia has no answer for the package.
    """
    spec = f"{package_name}@{version}" if version else package_name
    resp = requests.get(
        "https://bundlephobia.com/api/size",
        params={"package": spec},
        headers={"User-Agent": "Mozilla/5.0"},
        timeout=20,
    )
    if resp.status_code != 200:
        return None
    data = resp.json()
    minified = data.get("size", 0)
    gzipped = data.get("gzip", 0)
    return {
        "package": data.get("name"),
        "version": data.get("version"),
        "size_bytes": data.get("size"),
        "gzip_bytes": data.get("gzip"),
        "size_kb": round(minified / 1024, 1),
        "gzip_kb": round(gzipped / 1024, 1),
        "has_side_effects": data.get("hasSideEffects"),
        # A package with no side effects can be tree-shaken by bundlers.
        "tree_shakeable": not data.get("hasSideEffects", True),
    }
# Compare framework bundle sizes
for pkg in ["react", "vue", "svelte", "solid-js", "preact"]:
    info = get_bundle_size(pkg)
    # get_bundle_size returns None on a non-200 response, hence the guard.
    if info:
        print(f"{pkg}: {info['gzip_kb']} KB gzipped ({info['size_kb']} KB min)")
Practical Use Cases for npm Data
The npm registry dataset is surprisingly valuable for research and product development:
JavaScript Ecosystem Trends: Track the rise and fall of frameworks by monitoring monthly download trends. When a package crosses 1M downloads/month, it has crossed the mainstream threshold.
Dependency Auditing: Build tooling that alerts you when a package in your supply chain has unusual version publish patterns (multiple versions in 24 hours can indicate a hijacking attempt).
Library Selection: Compare maintenance activity, download velocity, and dependency count before choosing between competing packages.
Competitive Intelligence: Track download growth rates for packages in the same category (e.g., all HTTP clients) to spot emerging trends before they hit mainstream tech media.
npm Plugin Ecosystems: Map the plugin ecosystem around major frameworks (Webpack plugins, ESLint rules, Babel transforms) to understand how healthy a framework's community is.
The npm registry is one of the most scraping-friendly data sources available. No auth walls, clean JSON, reasonable rate limits, and 2.5+ million packages worth of ecosystem data. For most JavaScript ecosystem research, you will not need to touch any HTML at all — the registry API gives you everything you need.