Scrape Amazon Product Reviews in Python: Ratings, ASIN Data & Anti-Bot Bypass (2026)
Amazon is one of the hardest sites to scrape in 2026. They run CloudFront bot detection, behavioral fingerprinting, and CAPTCHA challenges that trigger after just a handful of requests from the same IP. If you're trying to pull product reviews, star ratings, or verified purchase data at any real scale — you need a proper setup.
This guide covers everything: scraping individual reviews and star breakdowns, extracting ASINs from search, handling Amazon's anti-bot stack, pagination, data storage, and a full production pipeline.
What You Can Extract
Each Amazon product review page gives you:
- Star rating (1-5) and review title
- Review body text (can be long — Amazon doesn't truncate in the raw HTML)
- Verified Purchase badge (yes/no)
- Reviewer name and profile link
- Review date and location ("Reviewed in the United States")
- Helpful vote count
- Reviewer's other reviews (profile link)
- Images attached to reviews (URLs)
- Video reviews (indicator)
- VINE Voice badge (high-trust reviewer program)
From product detail pages you also get the star histogram — the breakdown of 1-5 star ratings as percentages — without needing to scrape individual reviews.
The ASIN is the key. Every product has one, and you can construct review URLs directly: https://www.amazon.com/product-reviews/{ASIN}
Dependencies
pip install playwright playwright-stealth curl-cffi beautifulsoup4 lxml httpx
playwright install chromium
Method 1: curl_cffi (Fast, No Browser Required)
For many Amazon pages, curl_cffi with Chrome TLS impersonation is enough — no Playwright overhead:
import json
import random
import re
import sqlite3
import time
from datetime import datetime
from pathlib import Path
from urllib.parse import quote_plus

from bs4 import BeautifulSoup
from curl_cffi import requests as curl_requests
# Baseline browser-like headers sent with every Amazon request.
# The User-Agent value here is a placeholder: fetch_amazon_page()
# overrides it on every attempt with a random pick from USER_AGENTS.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Referer": "https://www.amazon.com/",
    "DNT": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
}
# Desktop user agents rotated between request attempts (Chrome on three
# platforms plus one Firefox).
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0",
]
def create_session(proxy_url: str | None = None) -> curl_requests.Session:
    """Create a curl_cffi session (Chrome TLS impersonation is applied
    per-request via the ``impersonate`` argument in fetch_amazon_page).

    Args:
        proxy_url: Optional proxy URL. BUG FIX: applied to both http and
            https traffic — the original set only the "https" key, so any
            plain-http request (e.g. a redirect) would bypass the proxy
            and leak the real IP.
    """
    session = curl_requests.Session()
    if proxy_url:
        session.proxies = {"http": proxy_url, "https": proxy_url}
    return session
def fetch_amazon_page(
url: str,
session: curl_requests.Session,
retries: int = 4,
) -> str | None:
"""Fetch an Amazon page with retries and CAPTCHA detection."""
for attempt in range(retries):
headers = {**HEADERS, "User-Agent": random.choice(USER_AGENTS)}
try:
resp = session.get(
url,
impersonate="chrome124",
headers=headers,
timeout=20,
allow_redirects=True,
)
if resp.status_code == 200:
# Check for CAPTCHA
if any(marker in resp.text for marker in [
"Type the characters you see",
"Robot Check",
"Sorry, we just need to make sure you're not a robot",
"/errors/validateCaptcha",
]):
wait = random.uniform(15, 30) * (attempt + 1)
print(f" CAPTCHA detected, waiting {wait:.0f}s before retry {attempt+1}")
time.sleep(wait)
continue
return resp.text
elif resp.status_code == 429:
wait = (2 ** attempt) * random.uniform(5, 10)
print(f" Rate limited (429), waiting {wait:.0f}s")
time.sleep(wait)
elif resp.status_code in (503, 504):
wait = random.uniform(5, 15)
time.sleep(wait)
else:
print(f" HTTP {resp.status_code} on attempt {attempt+1}")
return None
except Exception as e:
print(f" Request error on attempt {attempt+1}: {e}")
time.sleep(random.uniform(2, 6))
return None
def scrape_reviews_http(
    asin: str,
    session: curl_requests.Session,
    max_pages: int = 10,
    sort_by: str = "recent",  # "recent" or "helpful"
    star_filter: int | None = None,  # None = all, or 1-5
) -> list[dict]:
    """
    Scrape Amazon product reviews using curl_cffi HTTP requests.
    Faster than Playwright but may hit more CAPTCHAs.
    Args:
        asin: Amazon ASIN
        session: curl_cffi Session
        max_pages: Max review pages to scrape (10 reviews each)
        sort_by: "recent" or "helpful"
        star_filter: Filter to a specific star rating (1-5), or None for all
    Returns:
        All review dicts collected across pages (see parse_single_review).
    """
    all_reviews = []
    base_url = f"https://www.amazon.com/product-reviews/{asin}"
    for page_num in range(1, max_pages + 1):
        params = {
            "pageNumber": page_num,
            "sortBy": sort_by,
            "reviewerType": "all_reviews",
        }
        if star_filter:
            params["filterByStar"] = f"{star_filter}_star"
        # Values are all known-safe ASCII, so a plain join works here;
        # anything user-supplied would need urllib.parse.urlencode.
        param_str = "&".join(f"{k}={v}" for k, v in params.items())
        url = f"{base_url}?{param_str}"
        html = fetch_amazon_page(url, session)
        if not html:
            break
        soup = BeautifulSoup(html, "lxml")
        reviews = parse_review_page(soup, asin)
        if not reviews:
            # Check if we're out of reviews (empty page or no more)
            no_reviews_el = soup.select_one(".a-section.review-filter-info, .a-row.a-spacing-large")
            if no_reviews_el and "no customer reviews" in no_reviews_el.get_text().lower():
                break
            if page_num > 1:
                break
            # NOTE(review): an empty page 1 without the "no customer reviews"
            # marker falls through and the loop moves on to page 2 — confirm
            # that is intended rather than breaking immediately.
        all_reviews.extend(reviews)
        print(f" Page {page_num}: {len(reviews)} reviews (total {len(all_reviews)})")
        # Randomized delay — Amazon watches timing patterns
        time.sleep(random.uniform(5, 12))
    return all_reviews
Parsing Review Pages
def parse_review_page(soup: BeautifulSoup, asin: str) -> list[dict]:
    """Extract every review on one review page.

    Elements that fail to parse are silently skipped so a single broken
    review never aborts the page.
    """
    parsed: list[dict] = []
    for node in soup.select("[data-hook='review']"):
        try:
            record = parse_single_review(node, asin)
        except Exception:
            continue
        if record:
            parsed.append(record)
    return parsed
def parse_single_review(el, asin: str) -> dict | None:
    """Parse one "[data-hook='review']" element into a flat review dict.

    Args:
        el: bs4 Tag for a single review container. NOTE: mutated in place —
            embedded star-rating nodes are decomposed out of the title.
        asin: Product ASIN stamped into the returned record.

    Returns:
        Review dict, or None when neither a title nor a body was found
        (the element is treated as not being a real review).
    """
    # Unique review ID — Amazon exposes it as the element's id attribute.
    review_id = el.get("id", "")
    # Star rating; the "cmps-" variant appears on alternate review layouts.
    star_el = el.select_one("[data-hook='review-star-rating'], [data-hook='cmps-review-star-rating']")
    stars = None
    if star_el:
        # Text looks like "4.0 out of 5 stars" — grab the leading number.
        star_text = star_el.get_text(strip=True)
        star_match = re.search(r"([\d.]+)", star_text)
        stars = float(star_match.group(1)) if star_match else None
    # Review title
    title_el = el.select_one("[data-hook='review-title']")
    title = None
    if title_el:
        # Remove the embedded star rating text from title
        for child in title_el.select("[data-hook='review-star-rating']"):
            child.decompose()
        title = title_el.get_text(strip=True)
    # Review body — prefer the plain inner span (no data-hook) to skip
    # wrapper elements; fall back to the whole body element.
    body_el = el.select_one("[data-hook='review-body'] span:not([data-hook])")
    if not body_el:
        body_el = el.select_one("[data-hook='review-body']")
    body = body_el.get_text(strip=True) if body_el else None
    # Verified Purchase — presence of the badge element is the signal.
    verified_el = el.select_one("[data-hook='avp-badge'], [data-hook='cm-cr-review-content-verified']")
    verified = verified_el is not None
    # Reviewer name and profile link
    reviewer_el = el.select_one(".a-profile-name")
    reviewer_name = reviewer_el.get_text(strip=True) if reviewer_el else None
    profile_link_el = el.select_one("a.a-profile, [data-hook='genome-widget'] a")
    profile_link = profile_link_el.get("href") if profile_link_el else None
    # Review date and location
    date_el = el.select_one("[data-hook='review-date']")
    date_raw = date_el.get_text(strip=True) if date_el else None
    review_date = None
    review_country = None
    if date_raw:
        # "Reviewed in the United States on March 15, 2026"
        country_match = re.search(r"Reviewed in (.+?) on (.+)", date_raw)
        if country_match:
            review_country = country_match.group(1)
            review_date = country_match.group(2)
        else:
            # Unrecognized format — keep the raw text as the date.
            review_date = date_raw
    # Helpful votes — "123 people found this helpful" / "One person ..."
    helpful_el = el.select_one("[data-hook='helpful-vote-statement']")
    helpful_count = 0
    if helpful_el:
        helpful_text = helpful_el.get_text(strip=True)
        h_match = re.search(r"(\d+)", helpful_text.replace(",", ""))
        if h_match:
            helpful_count = int(h_match.group(1))
        elif "One" in helpful_text:
            # Amazon spells out "One" instead of the digit 1.
            helpful_count = 1
    # Images attached to review
    images = []
    for img_el in el.select("[data-hook='review-image-tile'] img, .review-image-tile img"):
        src = img_el.get("src", "")
        # Get full-size version by stripping the thumbnail size suffix
        # (pattern like "._SY88_." in the URL).
        full_src = re.sub(r"\._[A-Z]{2}\d+_\.", ".", src)
        if full_src:
            images.append(full_src)
    # Video review indicator
    video_el = el.select_one("[data-hook='review-video-thumbnail'], .cr-video-review")
    has_video = video_el is not None
    # VINE Voice badge — the second selector is broad, so the badge text
    # is double-checked for "vine" before it counts.
    vine_el = el.select_one("[data-hook='vine-customer-review-text'], .a-color-secondary.a-text-bold")
    is_vine = vine_el is not None and "vine" in (vine_el.get_text() or "").lower()
    # Variant purchased (if product has variants)
    variant_el = el.select_one("[data-hook='format-strip-linkless'], [data-hook='format-strip']")
    variant = variant_el.get_text(strip=True) if variant_el else None
    if not (title or body):
        return None
    return {
        "review_id": review_id,
        "asin": asin,
        "stars": stars,
        "title": title,
        "body": body,
        "verified_purchase": verified,
        "reviewer_name": reviewer_name,
        # Relative profile paths are absolutized against amazon.com.
        "profile_link": f"https://www.amazon.com{profile_link}" if profile_link and profile_link.startswith("/") else profile_link,
        "review_date": review_date,
        "review_country": review_country,
        "helpful_votes": helpful_count,
        "images": images,
        "has_video": has_video,
        "is_vine": is_vine,
        "variant_purchased": variant,
    }
Scraping the Star Histogram
The overall rating distribution (how many 1-star, 2-star, etc.) is on the main product page, not the review pages:
def scrape_rating_histogram(
    asin: str,
    session: curl_requests.Session,
) -> dict:
    """
    Scrape the star rating histogram from a product's detail page.

    Returns a dict of the form:
        {"asin": ..., "overall_rating": 4.3, "total_reviews": 1234,
         "histogram": {"5_star": 73, "4_star": 12, ...}}  # percentages
    or {} when the product page could not be fetched.
    """
    url = f"https://www.amazon.com/dp/{asin}"
    html = fetch_amazon_page(url, session)
    if not html:
        return {}
    soup = BeautifulSoup(html, "lxml")
    histogram = {}
    # Histogram table
    for row in soup.select("table#histogramTable tr, [data-hook='rating-out-of-text']"):
        row_text = row.get_text()
        for star_num in range(1, 6):
            if f"{star_num} star" in row_text.lower():
                # Takes the first percentage in the row — assumes one
                # percentage per histogram row; TODO confirm on live HTML.
                pct_match = re.search(r"(\d+)%", row_text)
                if pct_match:
                    histogram[f"{star_num}_star"] = int(pct_match.group(1))
    # Alternative: aria-label on histogram bars
    if not histogram:
        for bar in soup.select("[data-hook='rating-histogram'] .a-meter"):
            aria = bar.get("aria-label", "")
            star_match = re.search(r"(\d+) star", aria)
            pct_match = re.search(r"(\d+) percent", aria)
            if star_match and pct_match:
                histogram[f"{star_match.group(1)}_star"] = int(pct_match.group(1))
    # Overall rating — parsed from the #acrPopover title attribute.
    overall_el = soup.select_one("#acrPopover")
    overall_rating = None
    if overall_el:
        r_match = re.search(r"([\d.]+)", overall_el.get("title", ""))
        overall_rating = float(r_match.group(1)) if r_match else None
    # Total review count (comma-grouped number, e.g. "1,234 ratings").
    count_el = soup.select_one("#acrCustomerReviewText")
    total_reviews = None
    if count_el:
        c_match = re.search(r"([\d,]+)", count_el.get_text())
        if c_match:
            total_reviews = int(c_match.group(1).replace(",", ""))
    return {
        "asin": asin,
        "overall_rating": overall_rating,
        "total_reviews": total_reviews,
        "histogram": histogram,
    }
Method 2: Playwright (For Harder Targets)
When curl_cffi hits too many CAPTCHAs, fall back to full browser automation:
import asyncio
from playwright.async_api import async_playwright, Page
from playwright_stealth import stealth_async
# Injected into every new document before page scripts run (via
# context.add_init_script): hides navigator.webdriver, fakes a plugin
# list, and stubs window.chrome the way a real Chrome session exposes it.
STEALTH_SCRIPT = """
Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
Object.defineProperty(navigator, 'plugins', {
get: () => [
{ name: 'Chrome PDF Plugin', filename: 'internal-pdf-viewer' },
{ name: 'Chrome PDF Viewer', filename: 'mhjfbmdgcfjbbpaeojofohoefgiehjai' },
]
});
window.chrome = { runtime: {}, loadTimes: function() {}, csi: function() {}, app: {} };
"""
async def scrape_reviews_playwright(
    asin: str,
    proxy_url: str | None = None,
    max_pages: int = 5,
) -> list[dict]:
    """Scrape Amazon reviews using Playwright for harder anti-bot targets.

    Args:
        asin: Amazon ASIN whose review pages to walk.
        proxy_url: Optional proxy server URL for the browser.
        max_pages: Maximum review pages to visit (10 reviews per page).

    Returns:
        List of review dicts (see parse_single_review).
    """
    all_reviews: list[dict] = []
    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=True,
            args=[
                "--no-sandbox",
                "--disable-blink-features=AutomationControlled",
                "--disable-dev-shm-usage",
            ],
            proxy={"server": proxy_url} if proxy_url else None,
        )
        # BUG FIX: close the browser even when navigation or parsing
        # raises — the original skipped browser.close() on any error.
        try:
            context = await browser.new_context(
                viewport={"width": 1440, "height": 900},
                user_agent=random.choice(USER_AGENTS),
                locale="en-US",
                timezone_id="America/New_York",
            )
            await context.add_init_script(STEALTH_SCRIPT)
            page = await context.new_page()
            await stealth_async(page)
            # Warm up session with Amazon homepage
            await page.goto("https://www.amazon.com/", wait_until="domcontentloaded")
            await asyncio.sleep(random.uniform(2, 4))
            await page.evaluate("window.scrollTo(0, 400)")
            await asyncio.sleep(random.uniform(1, 2))
            for page_num in range(1, max_pages + 1):
                url = (
                    f"https://www.amazon.com/product-reviews/{asin}"
                    f"?pageNumber={page_num}&sortBy=recent&reviewerType=all_reviews"
                )
                try:
                    await page.goto(url, wait_until="domcontentloaded", timeout=30000)
                    await page.wait_for_selector("[data-hook='review']", timeout=10000)
                except Exception as e:
                    print(f" Page {page_num} load error: {e}")
                    break
                # Simulate reading by scrolling in two steps.
                await page.evaluate("window.scrollTo(0, document.body.scrollHeight * 0.3)")
                await asyncio.sleep(random.uniform(1, 2))
                await page.evaluate("window.scrollTo(0, document.body.scrollHeight * 0.7)")
                await asyncio.sleep(random.uniform(1, 2))
                html = await page.content()
                soup = BeautifulSoup(html, "lxml")
                reviews = parse_review_page(soup, asin)
                if not reviews:
                    break
                all_reviews.extend(reviews)
                print(f" Page {page_num}: {len(reviews)} reviews")
                await asyncio.sleep(random.uniform(5, 12))
        finally:
            await browser.close()
    return all_reviews
Extracting ASINs
Before scraping reviews, you need ASINs. Pull them from search results or category pages:
def extract_asins_from_search(
    query: str,
    session: curl_requests.Session,
    max_pages: int = 3,
) -> list[str]:
    """Extract unique ASINs from Amazon search result pages.

    Args:
        query: Search phrase. BUG FIX: now percent-encoded with
            quote_plus — the original only replaced spaces with '+',
            so queries containing '&', '+', '#', etc. broke the URL.
        session: curl_cffi Session used for the requests.
        max_pages: Number of search result pages to walk.

    Returns:
        Unique ASINs in the order they were first seen (the original
        deduplicated via set(), which shuffled the ranking order).
    """
    asins: list[str] = []
    for page_num in range(1, max_pages + 1):
        url = f"https://www.amazon.com/s?k={quote_plus(query)}&page={page_num}"
        html = fetch_amazon_page(url, session)
        if not html:
            break
        soup = BeautifulSoup(html, "lxml")
        # ASINs are in data-asin attributes on search result items;
        # valid ones are exactly 10 alphanumeric characters.
        for el in soup.select("[data-asin]"):
            asin = el.get("data-asin", "").strip()
            if len(asin) == 10 and asin.isalnum():
                asins.append(asin)
        print(f" Search page {page_num}: found {len(asins)} ASINs so far")
        time.sleep(random.uniform(5, 10))
    # dict.fromkeys deduplicates while preserving insertion order.
    return list(dict.fromkeys(asins))
def extract_asin_from_url(url: str) -> str | None:
"""Extract ASIN from an Amazon product URL."""
patterns = [
r"/dp/([A-Z0-9]{10})",
r"/gp/product/([A-Z0-9]{10})",
r"/product-reviews/([A-Z0-9]{10})",
r"asin=([A-Z0-9]{10})",
]
for pattern in patterns:
match = re.search(pattern, url)
if match:
return match.group(1)
return None
Proxy Configuration
THORDATA_USER = "your_username"
THORDATA_PASS = "your_password"
THORDATA_HOST = "proxy.thordata.com"
THORDATA_PORT = 9000
def get_proxy(sticky: bool = False, session_id: str = None) -> str:
"""
Build a ThorData residential proxy URL.
Sticky sessions keep the same IP for a scraping session — useful
for Amazon since they track session continuity.
"""
user = THORDATA_USER
if sticky and session_id:
user += f"_session-{session_id}"
return f"http://{user}:{THORDATA_PASS}@{THORDATA_HOST}:{THORDATA_PORT}"
def make_session_for_asin(asin: str) -> curl_requests.Session:
    """Build a sticky-session proxied client for one product's pages.

    Keeping a single exit IP across every page of one product looks far
    more natural to Amazon than rotating IPs mid-product.
    """
    sticky_proxy = get_proxy(sticky=True, session_id=f"reviews_{asin}")
    return create_session(proxy_url=sticky_proxy)
For high-volume Amazon scraping, ThorData's residential proxy pool handles the IP rotation automatically. Their sticky session mode keeps you on the same IP for the duration of a session — which matters for Amazon since rapid IP changes mid-session are a bot signal.
SQLite Storage
DB_PATH = Path("amazon_reviews.db")
def init_db() -> sqlite3.Connection:
"""Initialize reviews database."""
conn = sqlite3.connect(DB_PATH)
conn.executescript("""
CREATE TABLE IF NOT EXISTS products (
asin TEXT PRIMARY KEY,
title TEXT,
brand TEXT,
overall_rating REAL,
total_reviews INTEGER,
histogram TEXT,
updated_at TEXT DEFAULT (datetime('now'))
);
CREATE TABLE IF NOT EXISTS reviews (
review_id TEXT,
asin TEXT NOT NULL,
stars REAL,
title TEXT,
body TEXT,
verified_purchase INTEGER DEFAULT 0,
reviewer_name TEXT,
profile_link TEXT,
review_date TEXT,
review_country TEXT,
helpful_votes INTEGER DEFAULT 0,
has_video INTEGER DEFAULT 0,
is_vine INTEGER DEFAULT 0,
variant_purchased TEXT,
images TEXT,
scraped_at TEXT DEFAULT (datetime('now')),
PRIMARY KEY (review_id, asin)
);
CREATE INDEX IF NOT EXISTS idx_reviews_asin ON reviews(asin);
CREATE INDEX IF NOT EXISTS idx_reviews_stars ON reviews(asin, stars);
CREATE INDEX IF NOT EXISTS idx_reviews_verified ON reviews(asin, verified_purchase);
""")
conn.commit()
return conn
def save_review(conn: sqlite3.Connection, review: dict):
    """Persist one review; duplicates (same review_id + asin) are skipped
    by the INSERT OR IGNORE against the table's composite primary key."""
    row = (
        review.get("review_id"),
        review.get("asin"),
        review.get("stars"),
        review.get("title"),
        review.get("body"),
        int(bool(review.get("verified_purchase"))),
        review.get("reviewer_name"),
        review.get("profile_link"),
        review.get("review_date"),
        review.get("review_country"),
        review.get("helpful_votes", 0),
        int(bool(review.get("has_video"))),
        int(bool(review.get("is_vine"))),
        review.get("variant_purchased"),
        # Image URL list is stored as a JSON string.
        json.dumps(review.get("images", [])),
    )
    conn.execute(
        """
        INSERT OR IGNORE INTO reviews
        (review_id, asin, stars, title, body, verified_purchase, reviewer_name,
         profile_link, review_date, review_country, helpful_votes, has_video,
         is_vine, variant_purchased, images)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """,
        row,
    )
    conn.commit()
def save_histogram(conn: sqlite3.Connection, data: dict):
    """Upsert a product's overall rating and star histogram.

    The histogram dict is serialized to a JSON string; an existing row
    for the same ASIN is replaced wholesale (INSERT OR REPLACE).
    """
    values = (
        data.get("asin"),
        data.get("overall_rating"),
        data.get("total_reviews"),
        json.dumps(data.get("histogram", {})),
    )
    conn.execute(
        """
        INSERT OR REPLACE INTO products (asin, overall_rating, total_reviews, histogram)
        VALUES (?, ?, ?, ?)
        """,
        values,
    )
    conn.commit()
Review Analysis Queries
def analyze_reviews(conn: sqlite3.Connection, asin: str) -> dict:
    """Run analytics on scraped reviews for a product.

    Returns a dict with summary stats, star distribution, the five most
    helpful reviews, recent-vs-previous average-star trend, and a count
    of negative (<= 2 star) reviews with bodies.
    """
    # Summary stats. LENGTH(images) > 2 means the JSON array is not "[]",
    # i.e. the review has at least one attached image.
    summary = conn.execute("""
        SELECT
            COUNT(*) as total,
            AVG(stars) as avg_stars,
            SUM(CASE WHEN verified_purchase = 1 THEN 1 ELSE 0 END) as verified_count,
            AVG(helpful_votes) as avg_helpful,
            SUM(CASE WHEN has_video = 1 THEN 1 ELSE 0 END) as video_count,
            SUM(LENGTH(images) > 2) as with_images_count
        FROM reviews
        WHERE asin = ?
    """, (asin,)).fetchone()
    # Star distribution
    star_dist = conn.execute("""
        SELECT CAST(stars AS INTEGER) as star, COUNT(*) as count
        FROM reviews
        WHERE asin = ? AND stars IS NOT NULL
        GROUP BY CAST(stars AS INTEGER)
        ORDER BY star DESC
    """, (asin,)).fetchall()
    # Most helpful reviews (top 5)
    top_helpful = conn.execute("""
        SELECT stars, title, review_date, helpful_votes, verified_purchase
        FROM reviews
        WHERE asin = ?
        ORDER BY helpful_votes DESC
        LIMIT 5
    """, (asin,)).fetchall()
    # BUG FIX: the original put ORDER BY ... LIMIT after AVG(), which
    # aggregates over ALL rows (LIMIT applies post-aggregation); the
    # limit must be taken in a subquery before averaging.
    recent = conn.execute("""
        SELECT AVG(stars) FROM (
            SELECT stars FROM reviews WHERE asin = ?
            ORDER BY scraped_at DESC LIMIT 30
        )
    """, (asin,)).fetchone()
    # BUG FIX: derived tables have no rowid in SQLite, so the original
    # "WHERE rowid > 30" raised OperationalError on every call; use
    # LIMIT/OFFSET to select reviews 31-60 instead.
    previous = conn.execute("""
        SELECT AVG(stars) FROM (
            SELECT stars FROM reviews WHERE asin = ?
            ORDER BY scraped_at DESC LIMIT 30 OFFSET 30
        )
    """, (asin,)).fetchone()
    # Negative reviews with text (candidates for complaint mining)
    negative_reviews = conn.execute("""
        SELECT body FROM reviews
        WHERE asin = ? AND stars <= 2 AND body IS NOT NULL
        LIMIT 50
    """, (asin,)).fetchall()
    total = summary[0] if summary else 0
    return {
        "asin": asin,
        "total_scraped": total,
        "avg_stars": round(summary[1], 2) if summary and summary[1] else 0,
        "verified_pct": round(summary[2] / total * 100, 1) if total else 0,
        "avg_helpful_votes": round(summary[3] or 0, 1),
        "star_distribution": {f"{r[0]}_star": r[1] for r in star_dist},
        "top_helpful_reviews": [
            {"stars": r[0], "title": r[1], "date": r[2], "helpful": r[3], "verified": bool(r[4])}
            for r in top_helpful
        ],
        "recent_avg_stars": round(recent[0], 2) if recent and recent[0] else 0,
        # New keys (backward-compatible additions): the original computed
        # these values but dropped them from the result.
        "previous_avg_stars": round(previous[0], 2) if previous and previous[0] else 0,
        "with_video_count": (summary[4] or 0) if summary else 0,
        "with_images_count": (summary[5] or 0) if summary else 0,
        "negative_review_count": len(negative_reviews),
    }
Full Pipeline
def run_review_pipeline(
    asins: list[str],
    proxy_url: str | None = None,
    max_pages_per_product: int = 10,
    star_filters: list[int] | None = None,
    db_path: str = "amazon_reviews.db",
) -> dict:
    """
    Full review scraping pipeline: histogram + reviews + analysis per ASIN.
    Args:
        asins: List of ASINs to scrape
        proxy_url: Proxy URL (residential recommended)
        max_pages_per_product: Max review pages per product (10 reviews each)
        star_filters: Optionally limit to specific star ratings (e.g. [1, 2, 5])
        db_path: SQLite database path
    Returns:
        Counter dict: {"products": N, "reviews": N, "errors": N}.
        NOTE(review): "errors" is never incremented anywhere — confirm
        whether failure counting was intended.
    """
    # NOTE(review): db_path is accepted but never used — init_db() always
    # opens the module-level DB_PATH. Confirm and wire it through.
    conn = init_db()
    stats = {"products": 0, "reviews": 0, "errors": 0}
    # NOTE(review): global_session is created but never used; every ASIN
    # gets its own sessions below.
    global_session = create_session(proxy_url)
    for asin in asins:
        print(f"\n=== ASIN: {asin} ===")
        # Get histogram first (1 request, product page)
        print(" Fetching rating histogram...")
        hist_session = create_session(proxy_url)
        histogram = scrape_rating_histogram(asin, hist_session)
        if histogram:
            save_histogram(conn, histogram)
            print(f" Overall: {histogram.get('overall_rating')}/5 ({histogram.get('total_reviews')} reviews)")
        time.sleep(random.uniform(5, 10))
        # Scrape reviews — either one pass per requested star rating,
        # or a single pass over all reviews.
        if star_filters:
            for star_val in star_filters:
                print(f" Scraping {star_val}-star reviews...")
                # Sticky proxy session per (asin, star) pass.
                review_session = make_session_for_asin(f"{asin}_{star_val}")
                reviews = scrape_reviews_http(
                    asin, review_session,
                    max_pages=max_pages_per_product,
                    star_filter=star_val,
                )
                for r in reviews:
                    save_review(conn, r)
                stats["reviews"] += len(reviews)
                print(f" Saved {len(reviews)} {star_val}-star reviews")
                time.sleep(random.uniform(8, 15))
        else:
            print(" Scraping all reviews...")
            review_session = make_session_for_asin(asin)
            reviews = scrape_reviews_http(
                asin, review_session,
                max_pages=max_pages_per_product,
            )
            for r in reviews:
                save_review(conn, r)
            stats["reviews"] += len(reviews)
            print(f" Saved {len(reviews)} reviews")
        stats["products"] += 1
        # Analyze what was just stored for this ASIN.
        analysis = analyze_reviews(conn, asin)
        print(f" Analysis: {analysis['total_scraped']} reviews, "
              f"{analysis['avg_stars']}★ avg, "
              f"{analysis['verified_pct']}% verified")
        time.sleep(random.uniform(10, 20))
    conn.close()
    print(f"\nPipeline complete: {stats}")
    return stats
# Usage — guarded so importing this module doesn't kick off a live scrape.
if __name__ == "__main__":
    asins = ["B0D5CSL2FN", "B0CRMZD9MH"]
    run_review_pipeline(
        asins,
        proxy_url="http://user:[email protected]:9000",
        max_pages_per_product=5,
        star_filters=[1, 2, 5],  # Focus on extreme reviews
    )
Rate Limiting Best Practices
- 5-12 second delays between review pages minimum. Amazon tracks timing patterns closely.
- 10-20 second delays between different products.
- Use sticky sessions (same IP across all pages of one product). IP changes mid-product look like bot handoffs.
- Rotate user agents between products, not within a single product's session.
- Don't run concurrent sessions for the same product — that's the clearest bot signal.
- Run during US business hours if scraping amazon.com. Residential IPs active at 3 AM look suspicious.
- Cap at 3-5 concurrent products maximum, and only with a solid residential proxy pool.
Legal Note
Amazon's terms of service prohibit scraping. Use this for personal research, academic analysis, competitive intelligence, and internal business analysis. Do not redistribute raw scraped review data commercially or in bulk. Consult your legal counsel for commercial applications.