How to Scrape Yelp Business Reviews with Python (2026)
Yelp has over 200 million reviews across millions of businesses. If you're doing competitive analysis, building a local business dataset, or researching consumer sentiment, that data is valuable. The challenge is getting it out cleanly at scale.
Yelp offers a Fusion API (free tier: 5,000 calls/day) that gives you business listings, ratings, and basic info. But it deliberately withholds individual review text — you only get 3 review excerpts per business. For full review content, you need to scrape the HTML directly.
This guide covers both approaches, including: API setup, HTML scraping, pagination, proxy rotation for anti-bot bypass, photo extraction, and storing results in SQLite.
Environment Setup
pip install httpx beautifulsoup4 lxml
Approach 1: Yelp Fusion API
The API is the clean path for business listings and metadata. Sign up at Yelp Fusion to get a free API key. The free tier gives you 5,000 calls/day, which is sufficient for most projects.
Business Search
import httpx
import time
import json
API_KEY = "YOUR_YELP_API_KEY"
BASE = "https://api.yelp.com/v3"

def make_client(api_key: str = API_KEY):
    """Build an httpx client pre-configured with Yelp bearer-token auth."""
    auth_headers = {"Authorization": f"Bearer {api_key}"}
    return httpx.Client(headers=auth_headers, timeout=15)

# Shared module-level client used by all the API helper functions below.
client = make_client()
def search_businesses(term: str, location: str, limit: int = 50,
                      sort_by: str = "review_count", categories: str = None) -> list:
    """
    Search Yelp businesses via the Fusion API, paging through results.

    sort_by: best_match, rating, review_count, distance
    The API returns at most 50 results per request and caps the total
    reachable offset at 1000.
    """
    collected = []
    capped = min(limit, 1000)
    offset = 0
    while offset < capped:
        query = {
            "term": term,
            "location": location,
            "limit": min(50, capped - offset),
            "offset": offset,
            "sort_by": sort_by,
        }
        if categories:
            query["categories"] = categories
        resp = client.get(f"{BASE}/businesses/search", params=query)
        resp.raise_for_status()
        payload = resp.json()
        page = payload.get("businesses", [])
        if not page:
            break
        collected.extend(page)
        # Stop once we've seen everything Yelp says exists for this query.
        if len(collected) >= payload.get("total", 0):
            break
        time.sleep(0.2)
        offset += 50
    return collected[:limit]
# Example: find top pizza restaurants in NYC
restaurants = search_businesses("pizza", "New York, NY", limit=100, sort_by="rating")
# Show the first five hits with their headline stats.
for biz in restaurants[:5]:
    print(f"{biz['name']} — {biz['rating']}★ ({biz['review_count']} reviews)")
    print(f" Address: {', '.join(biz['location'].get('display_address', []))}")
    # 'price' is not present on every listing, hence the default.
    print(f" Price: {biz.get('price', 'N/A')}")
Business Details + Review Excerpts
def get_business_details(business_id: str) -> dict:
    """Fetch the complete profile for one business (hours, attributes, photos)."""
    url = f"{BASE}/businesses/{business_id}"
    response = client.get(url)
    response.raise_for_status()
    return response.json()
def get_review_excerpts(business_id: str) -> list:
    """
    Return up to 3 review excerpts for a business.

    The Fusion API caps the count at 3 and truncates the text — these
    are not the full reviews.
    """
    endpoint = f"{BASE}/businesses/{business_id}/reviews"
    query = {"limit": 3, "sort_by": "yelp_sort"}
    response = client.get(endpoint, params=query)
    response.raise_for_status()
    payload = response.json()
    return payload.get("reviews", [])
# Full business profile
details = get_business_details(restaurants[0]["id"])
print(f"\n{details['name']}")
print(f"Categories: {', '.join(c['title'] for c in details['categories'])}")
print(f"Price: {details.get('price', 'N/A')}")
print(f"Phone: {details.get('display_phone', 'N/A')}")
# 'hours' may be missing entirely; the [{}] default guards the [0] index.
print(f"Hours open today: {details.get('hours', [{}])[0].get('is_open_now', 'N/A')}")
# API excerpts: at most 3, truncated (see get_review_excerpts).
reviews = get_review_excerpts(restaurants[0]["id"])
for r in reviews:
    print(f"\n {r['rating']}★ — {r['text'][:150]}...")
    print(f" By: {r['user']['name']} | {r['time_created']}")
What the API Gives You
The Fusion API provides: - Business name, address, coordinates, phone number - Overall rating and review count - Categories, price range, attributes - Up to 3 photos - Hours of operation - Is open now status - URL and yelp page link
What it won't give you: - Full review text (only 3 truncated excerpts) - All reviewer profiles - Historical rating changes - Photo captions and all user-submitted photos - Individual star breakdowns (1★ vs 5★ count)
For full reviews, you need to scrape the HTML.
Approach 2: Direct HTML Scraping
Yelp's review pages are primarily server-rendered, which makes them scrapable without a headless browser for most cases. Review content is embedded in the initial HTML as JSON-LD and standard DOM elements.
import httpx
from bs4 import BeautifulSoup
import json
import re
import time
import random
# Desktop browser User-Agent strings, one picked at random per session to
# vary the request fingerprint (Chrome on macOS/Windows/Linux, plus Firefox).
UA_POOL = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0",
]
def make_scrape_session(proxy_url: str = None):
    """Create an httpx client that presents browser-like request headers.

    proxy_url: optional proxy to route all requests through (see get_proxy).
    """
    browser_headers = {
        "User-Agent": random.choice(UA_POOL),
        "Accept-Language": "en-US,en;q=0.9",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "DNT": "1",
        "Referer": "https://www.yelp.com/",
    }
    return httpx.Client(
        headers=browser_headers,
        proxy=proxy_url,
        timeout=25,
        follow_redirects=True,
    )
def parse_reviews_from_html(soup: BeautifulSoup) -> list:
    """Extract reviews from Yelp HTML, trying strategies from most to
    least reliable.

    1. JSON-LD structured data in <script type="application/ld+json">
    2. Inline script state (structure varies by Yelp version; stub)
    3. DOM selectors keyed on stable attributes (data-review-id, aria-label)

    Returns a list of dicts with author/rating/date/text plus a "source"
    key recording which strategy produced each review.
    """
    reviews = []
    business_types = {"LocalBusiness", "Restaurant", "FoodEstablishment",
                      "Hotel", "LodgingBusiness"}
    # Strategy 1: JSON-LD structured data (most reliable, least fragile)
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            ld = json.loads(script.string)
        except (json.JSONDecodeError, TypeError):
            continue
        if not isinstance(ld, dict):
            continue
        # Per schema.org, @type may be a single string or a list of types.
        raw_type = ld.get("@type", "")
        types = raw_type if isinstance(raw_type, list) else [raw_type]
        if not business_types.intersection(types):
            continue
        # BUG FIX: "review" may be a single object rather than an array;
        # the original iterated the dict's keys in that case.
        ld_reviews = ld.get("review", [])
        if isinstance(ld_reviews, dict):
            ld_reviews = [ld_reviews]
        for rev in ld_reviews:
            if not isinstance(rev, dict):
                continue
            # BUG FIX: author/reviewRating may be plain values, not objects;
            # the original raised AttributeError and the broad except then
            # silently dropped every remaining review in the document.
            author = rev.get("author")
            rating = rev.get("reviewRating")
            reviews.append({
                "author": author.get("name") if isinstance(author, dict) else author,
                "rating": rating.get("ratingValue") if isinstance(rating, dict) else rating,
                "date": rev.get("datePublished"),
                "text": rev.get("description", ""),
                "source": "json-ld",
            })
    if reviews:
        return reviews
    # Strategy 2: review data in inline script state (window.yelp.rl_init).
    # The payload structure changes between Yelp frontend versions, so this
    # is left as a validated-parse stub rather than a brittle guess.
    for script in soup.find_all("script"):
        if script.string and "reviewText" in str(script.string):
            match = re.search(r"window\.yelp\.rl_init.*?({.*?});", str(script.string), re.DOTALL)
            if match:
                try:
                    data = json.loads(match.group(1))
                    # Parse structure — varies by Yelp version
                    pass
                except json.JSONDecodeError:
                    pass
    # Strategy 3: DOM selectors as last resort. Yelp's generated class names
    # churn, but data-review-id and aria-label attributes are more stable.
    for review_div in soup.find_all("div", attrs={"data-review-id": True}):
        review_id = review_div.get("data-review-id")
        # Star rating is exposed via an aria-label like "4 star rating".
        rating = None
        rating_el = review_div.find(attrs={"aria-label": re.compile(r"\d star rating")})
        if rating_el:
            match = re.search(r"(\d)", rating_el.get("aria-label", ""))
            if match:
                rating = int(match.group(1))
        # Review text: first reasonably long paragraph inside the wrapper.
        text = None
        for p in review_div.find_all("p"):
            text_candidate = p.get_text(strip=True)
            if len(text_candidate) > 50:
                text = text_candidate
                break
        # Author: the reviewer's profile link points at /user_details.
        author = None
        author_el = review_div.find("a", href=re.compile(r"/user_details"))
        if author_el:
            author = author_el.get_text(strip=True)
        # Date rendered as M/D/YYYY.
        date = None
        date_el = review_div.find("span", string=re.compile(r"\d{1,2}/\d{1,2}/\d{4}"))
        if date_el:
            date = date_el.get_text(strip=True)
        if text or rating:
            reviews.append({
                "review_id": review_id,
                "author": author,
                "rating": rating,
                "date": date,
                "text": text,
                "source": "dom",
            })
    return reviews
Handling Pagination
Yelp uses offset-based pagination on review pages:
def scrape_reviews(business_url: str, max_pages: int = 10,
                   proxy_url: str = None) -> list:
    """Scrape full review text from a Yelp business page.

    business_url: the listing URL (query params will be replaced).
    max_pages: number of 10-review pages to fetch at most.
    proxy_url: optional proxy routed through make_scrape_session.

    Stops early on request errors, non-200 responses, bot-detection pages,
    or a page with no parseable reviews.
    """
    all_reviews = []
    session = make_scrape_session(proxy_url)
    # Warm-up: visit the homepage first so the session carries normal
    # cookies; failures here are non-fatal.
    try:
        session.get("https://www.yelp.com/", timeout=15)
        time.sleep(random.uniform(1, 2))
    except Exception:
        pass
    for page_num in range(max_pages):
        offset = page_num * 10  # Yelp paginates reviews 10 at a time
        url = f"{business_url}?start={offset}&sort_by=date_desc"
        print(f"Fetching page {page_num + 1}: offset {offset}")
        resp = None
        # BUG FIX: on a 429 the original `continue`d the page loop, which
        # permanently skipped the rate-limited offset. Instead, back off
        # and retry the SAME page once.
        for attempt in range(2):
            try:
                resp = session.get(url)
            except httpx.RequestError as e:
                print(f"Request failed: {e}")
                resp = None
                break
            if resp.status_code != 429:
                break
            print("Rate limited — waiting 60s")
            time.sleep(60)
        if resp is None:
            break
        if resp.status_code not in (200, 301, 302):
            print(f"Got {resp.status_code} — stopping")
            break
        soup = BeautifulSoup(resp.text, "lxml")
        # Check for anti-bot pages before trying to parse reviews.
        page_title = soup.find("title")
        title_text = page_title.get_text() if page_title else ""
        if "Access Denied" in title_text or "captcha" in resp.text.lower():
            print("Bot detection triggered — stopping or rotate proxy")
            break
        reviews = parse_reviews_from_html(soup)
        if not reviews:
            print("No reviews found on this page — may be end of reviews")
            break
        all_reviews.extend(reviews)
        print(f" Found {len(reviews)} reviews (total: {len(all_reviews)})")
        # Randomized delay — essential
        time.sleep(random.uniform(2.5, 6.0))
    return all_reviews
Dealing with Yelp's Anti-Scraping
Yelp is aggressive about blocking scrapers. Here's what you'll face:
CAPTCHA walls — After 20-30 requests from the same IP, Yelp serves a CAPTCHA page instead of content.
Soft bans — Your IP gets flagged and all requests return 503 or redirect to a blocked page for hours.
JavaScript challenges — Some pages require JS execution to render (though most review pages are still server-rendered).
Request fingerprinting — Yelp checks TLS fingerprints, header order, and cookie behavior.
For scraping more than a handful of businesses, residential proxies are essential. Routing Yelp scraping traffic through ThorData's residential proxies provides a large enough IP pool to rotate on every session without repeating IPs.
THORDATA_USER = "your_username"
THORDATA_PASS = "your_password"

def get_proxy(country: str = "us", sticky_id: str = None) -> str:
    """
    Build a ThorData residential proxy URL.

    country: two-letter country code to geo-target the exit IP.
    sticky_id: pass a stable token to keep the same IP across a business
        scraping session; omit it for a rotating IP per request.
    """
    # FIX: removed an unused `import uuid` the original carried here.
    user = f"{THORDATA_USER}-country-{country}"
    if sticky_id:
        user += f"-session-{sticky_id}"
    return f"http://{user}:{THORDATA_PASS}@proxy.thordata.net:9000"
# Use a fresh sticky session per business
import uuid

def scrape_business_with_proxy(business_url: str, country: str = "us") -> list:
    """Scrape one business through a sticky residential proxy session."""
    # First 8 hex chars of a UUID4 are plenty for a per-business session tag.
    sticky = uuid.uuid4().hex[:8]
    proxy_url = get_proxy(country=country, sticky_id=sticky)
    return scrape_reviews(business_url, max_pages=5, proxy_url=proxy_url)
Key anti-detection practices:
1. Randomize delays — random.uniform(2.5, 6.0) between pages
2. Rotate User-Agents — keep a pool of 5+ browser strings
3. Don't paginate linearly — shuffle your business list
4. Respect 503s — back off for 5+ minutes on server errors
5. Match proxy country to listing locale — US IPs for US businesses
Extracting Photos
def get_photo_urls(business_url: str, session=None) -> list:
    """Extract full-size photo URLs from a Yelp business photos page.

    business_url: the listing URL; query params are stripped and "/photos"
        is appended.
    session: optional pre-built httpx client; a fresh scraping session is
        created when omitted.

    Returns a de-duplicated list of photo URLs rewritten to the /o.jpg
    (original size) variant; empty list on any non-200 response.
    """
    if not session:
        session = make_scrape_session()
    # Remove query params so "/photos" appends onto the clean listing URL.
    base_url = business_url.split("?")[0]
    photos_url = f"{base_url}/photos"
    resp = session.get(photos_url)
    if resp.status_code != 200:
        return []
    soup = BeautifulSoup(resp.text, "lxml")
    photos = []
    # Business photos carry 'bphoto' in their URL. Lazy-loaded images keep
    # it in data-src instead of src, so check both attributes — the
    # original duplicated this whole loop per attribute.
    for attr in ("src", "data-src"):
        for img in soup.select(f"img[{attr}*='bphoto']"):
            src = img.get(attr, "")
            if not src:
                continue
            # Swap the sized variant suffix for /o.jpg = original size.
            full_src = re.sub(r"/[a-z]+\.jpg", "/o.jpg", src)
            if full_src not in photos:
                photos.append(full_src)
    return photos
Storing Data
import csv
import sqlite3
from datetime import datetime, timezone
def setup_yelp_db(db_path: str = "yelp.db") -> sqlite3.Connection:
    """Open the SQLite store, creating the businesses/reviews schema if absent."""
    conn = sqlite3.connect(db_path)
    ddl_statements = (
        """
        CREATE TABLE IF NOT EXISTS businesses (
            id TEXT PRIMARY KEY,
            name TEXT, rating REAL, review_count INTEGER,
            price TEXT, address TEXT, phone TEXT,
            categories TEXT, url TEXT, latitude REAL, longitude REAL,
            scraped_at TEXT
        )
        """,
        """
        CREATE TABLE IF NOT EXISTS reviews (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            business_id TEXT, business_url TEXT,
            author TEXT, rating INTEGER, date TEXT,
            text TEXT, review_id TEXT, source TEXT,
            scraped_at TEXT
        )
        """,
        # Reviews are always looked up per business, hence the index.
        "CREATE INDEX IF NOT EXISTS idx_biz ON reviews(business_id)",
    )
    for ddl in ddl_statements:
        conn.execute(ddl)
    conn.commit()
    return conn
def save_businesses_to_db(conn, businesses: list):
    """Upsert Fusion API business records into the businesses table.

    Uses INSERT OR REPLACE keyed on the Yelp business id, so re-running a
    search refreshes ratings/review counts instead of duplicating rows.
    """
    # FIX: timezone-aware UTC timestamp; datetime.utcnow() is deprecated
    # and produced a naive (unlabeled) time.
    now = datetime.now(timezone.utc).isoformat()
    for biz in businesses:
        conn.execute("""
            INSERT OR REPLACE INTO businesses
            (id, name, rating, review_count, price, address, phone,
             categories, url, latitude, longitude, scraped_at)
            VALUES (?,?,?,?,?,?,?,?,?,?,?,?)
        """, (
            biz.get("id"),
            biz.get("name"),
            biz.get("rating"),
            biz.get("review_count"),
            biz.get("price", ""),
            # Flatten the API's display_address list into one string.
            ", ".join(biz.get("location", {}).get("display_address", [])),
            biz.get("display_phone", ""),
            ", ".join(c["title"] for c in biz.get("categories", [])),
            biz.get("url", ""),
            biz.get("coordinates", {}).get("latitude"),
            biz.get("coordinates", {}).get("longitude"),
            now,
        ))
    conn.commit()
def save_reviews_to_db(conn, reviews: list, business_id: str, business_url: str):
    """Append scraped reviews for one business.

    Plain INSERT (no de-dup), so re-scraping the same business can create
    duplicate rows — callers who re-scrape should clear old rows first.
    """
    # FIX: timezone-aware UTC timestamp; datetime.utcnow() is deprecated
    # and produced a naive (unlabeled) time.
    now = datetime.now(timezone.utc).isoformat()
    for r in reviews:
        conn.execute("""
            INSERT INTO reviews
            (business_id, business_url, author, rating, date, text,
             review_id, source, scraped_at)
            VALUES (?,?,?,?,?,?,?,?,?)
        """, (
            business_id, business_url,
            r.get("author"), r.get("rating"), r.get("date"),
            r.get("text"), r.get("review_id"), r.get("source"),
            now,
        ))
    conn.commit()
def export_to_csv(conn, filename: str = "yelp_reviews.csv"):
    """Export joined business + review rows to a UTF-8 CSV file.

    Rows are ordered by business name, then review date descending.
    """
    cursor = conn.execute("""
        SELECT b.name, b.rating as overall_rating, b.price, b.address,
               r.author, r.rating as review_rating, r.date, r.text
        FROM reviews r
        JOIN businesses b ON r.business_id = b.id
        ORDER BY b.name, r.date DESC
    """)
    rows = cursor.fetchall()
    headers = ["business_name", "overall_rating", "price", "address",
               "reviewer", "review_rating", "date", "review_text"]
    # newline="" is required so csv handles its own line endings.
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(headers)
        writer.writerows(rows)
    # BUG FIX: the original printed the literal text "(unknown)" instead of
    # interpolating the output path.
    print(f"Exported {len(rows)} reviews to {filename}")
Building a Full Dataset
Combining both approaches gives you the most complete dataset:
def build_local_business_dataset(search_term: str, location: str,
                                 limit: int = 50,
                                 scrape_reviews_flag: bool = True) -> str:
    """
    End-to-end pipeline: API search for listings, optional HTML scrape for
    full review text, everything persisted to SQLite.

    Returns the database path ("yelp.db").
    """
    conn = setup_yelp_db()
    # Step 1: business listings via the API (clean, fast, no ban risk)
    print(f"Searching: {search_term} in {location}...")
    listings = search_businesses(search_term, location, limit=limit)
    print(f"Found {len(listings)} businesses")
    save_businesses_to_db(conn, listings)
    if scrape_reviews_flag:
        # Step 2: full review text via HTML, one sticky proxy per business
        total = len(listings)
        for idx, biz in enumerate(listings, start=1):
            print(f"\n[{idx}/{total}] {biz['name']}...")
            clean_url = biz.get("url", "").split("?")[0]
            if not clean_url:
                continue
            proxy = get_proxy(country="us", sticky_id=str(uuid.uuid4())[:8])
            found = scrape_reviews(clean_url, max_pages=3, proxy_url=proxy)
            if found:
                save_reviews_to_db(conn, found, biz["id"], clean_url)
                print(f" Saved {len(found)} reviews")
            # Pause longer between businesses than between pages
            time.sleep(random.uniform(5, 12))
    conn.close()
    return "yelp.db"
# Run it — NOTE: with scraping enabled this makes live API + HTML requests.
db_path = build_local_business_dataset("coffee shops", "San Francisco, CA", limit=20)
print(f"\nDatabase saved to: {db_path}")
Review Sentiment Analysis
from collections import Counter
def analyze_sentiment(reviews: list) -> dict:
    """Basic lexicon analysis of review text and star ratings.

    Matching is plain substring counting ("bad" also matches "badly"),
    which keeps it dependency-free but approximate.

    Returns the total/average rating, star distribution, and the
    most-mentioned positive/negative lexicon words.
    """
    ratings = [r["rating"] for r in reviews if r.get("rating")]
    all_text = " ".join(r.get("text", "") for r in reviews if r.get("text")).lower()
    positive_words = ["excellent", "amazing", "wonderful", "fantastic", "perfect",
                      "great", "love", "loved", "best", "outstanding", "delicious",
                      "friendly", "clean", "fresh", "recommend"]
    negative_words = ["terrible", "awful", "horrible", "worst", "disgusting",
                      "dirty", "rude", "slow", "disappointing", "overpriced",
                      "cold", "stale", "never", "avoid", "bad"]
    # FIX: count each word once — the original called str.count twice per
    # word (once in the filter, once for the value).
    pos_counts = ((w, all_text.count(w)) for w in positive_words)
    neg_counts = ((w, all_text.count(w)) for w in negative_words)
    pos_hits = {w: n for w, n in pos_counts if n > 0}
    neg_hits = {w: n for w, n in neg_counts if n > 0}
    dist = Counter(ratings)
    avg = sum(r * c for r, c in dist.items()) / sum(dist.values()) if dist else 0
    return {
        "total_reviews": len(reviews),
        "average_rating": round(avg, 2),
        "rating_distribution": dict(sorted(dist.items())),
        "top_positive_words": sorted(pos_hits.items(), key=lambda x: -x[1])[:5],
        "top_negative_words": sorted(neg_hits.items(), key=lambda x: -x[1])[:5],
        "positive_mentions": sum(pos_hits.values()),
        "negative_mentions": sum(neg_hits.values()),
    }
Legal Considerations
Yelp's Terms of Service prohibit scraping. The Fusion API is the officially sanctioned data access path. Several court cases (hiQ v. LinkedIn being the landmark) have established that scraping public data isn't a violation of the CFAA, but Yelp has pursued scrapers under state contract law.
Guidelines for staying safe: - Use the API where it works — it gives you business data without risk - Scrape only what the API can't provide (full review text) - Don't redistribute raw review text commercially - Don't pretend to be Yelp or imply affiliation - For commercial products built on Yelp review data — consult a lawyer
Use the API where it works. Scrape only what the API can't provide. And don't hammer their servers — slow and respectful scraping is less likely to trigger bans and less likely to create legal exposure.