Scraping TripAdvisor Reviews and Business Data (2026)
TripAdvisor is one of the richest sources of business reviews on the web — restaurants, hotels, attractions, all with detailed ratings, review text, and metadata. Over 1 billion reviews across 8 million businesses. Here's how to scrape it programmatically in 2026.
Understanding TripAdvisor URLs
Every listing follows a predictable pattern:
https://www.tripadvisor.com/Restaurant_Review-g187147-d1751525-Reviews-Le_Cinq-Paris.html
The key parts: g187147 is the geo ID (Paris), d1751525 is the business ID. For pagination, TripAdvisor appends -or10-, -or20-, etc. before the business name — each page shows 10 reviews.
/Restaurant_Review-g187147-d1751525-Reviews-or10-Le_Cinq-Paris.html # page 2
/Restaurant_Review-g187147-d1751525-Reviews-or20-Le_Cinq-Paris.html # page 3
Hotels use Hotel_Review, attractions use Attraction_Review. Same structure.
You can also find geo IDs programmatically by scraping TripAdvisor's search TypeAhead endpoint:
import httpx
from urllib.parse import quote
def find_geo_id(city: str) -> dict | None:
    """Find TripAdvisor geo ID and metadata for a city.

    Queries TripAdvisor's TypeAhead search endpoint and returns the top
    match, or None when no result is found.

    Args:
        city: Free-text city query, e.g. "Paris, France".

    Returns:
        Dict with "value" (geo ID such as "g187147"), "name", and "type",
        or None when the search yields nothing.

    Raises:
        httpx.HTTPStatusError: on a non-2xx response (e.g. a block page).
    """
    url = "https://www.tripadvisor.com/TypeAheadJson"
    params = {
        "action": "API",
        "types": "geo",
        "query": city,
        "max": "5",
        "lang": "en_US",
    }
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36"
        )
    }
    resp = httpx.get(url, params=params, headers=headers, timeout=15)
    # A blocked or rate-limited request returns a non-200 HTML page;
    # fail loudly here instead of raising a confusing JSONDecodeError.
    resp.raise_for_status()
    results = resp.json().get("results", [])
    if not results:
        return None
    best = results[0]
    return {
        "value": best.get("value"),  # e.g., "g187147"
        "name": best.get("display_name"),
        "type": best.get("type"),
    }

# Example — guard against None: unknown cities return no geo ID, and the
# original unconditional print would raise TypeError in that case.
paris = find_geo_id("Paris, France")
if paris:
    print(f"Paris geo ID: {paris['value']}")
JSON-LD: The Easy Win
Before touching the DOM, check the page source for JSON-LD structured data. TripAdvisor embeds a Restaurant or Hotel schema object with aggregate ratings that is far more stable than CSS class selectors:
from playwright.async_api import async_playwright
import json
import asyncio
async def extract_business_json_ld(url: str, proxy: dict | None = None) -> dict:
    """Extract structured business data from TripAdvisor JSON-LD.

    Loads the listing page in headless Chromium, scans every
    <script type="application/ld+json"> block, and returns a flat dict
    for the first Restaurant/Hotel/attraction schema object found.
    Returns an empty dict when no matching object is present.

    Args:
        url: Full TripAdvisor listing URL.
        proxy: Optional Playwright proxy config (server/username/password).
    """
    target_types = {
        "Restaurant", "Hotel", "TouristAttraction",
        "LodgingBusiness", "FoodEstablishment",
    }
    async with async_playwright() as p:
        launch_kwargs = {"headless": True}
        if proxy:
            launch_kwargs["proxy"] = proxy
        browser = await p.chromium.launch(**launch_kwargs)
        context = await browser.new_context(
            user_agent=(
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                "AppleWebKit/537.36 Chrome/122.0.0.0 Safari/537.36"
            ),
            viewport={"width": 1440, "height": 900},
            locale="en-US",
        )
        page = await context.new_page()
        business: dict = {}
        try:
            await page.goto(url, wait_until="domcontentloaded", timeout=30000)
            scripts = await page.query_selector_all(
                'script[type="application/ld+json"]'
            )
            for script in scripts:
                try:
                    data = json.loads(await script.inner_text())
                except (json.JSONDecodeError, AttributeError):
                    continue
                # A JSON-LD block may hold one object or a list of them.
                candidates = data if isinstance(data, list) else [data]
                for entry in candidates:
                    if not isinstance(entry, dict):
                        continue
                    # "@type" can be a string OR a list of type names;
                    # the original equality check never matched the list form.
                    raw_type = entry.get("@type")
                    types = raw_type if isinstance(raw_type, list) else [raw_type]
                    if target_types.intersection(types):
                        business = _business_from_json_ld(entry, url)
                        break
                if business:
                    break
        finally:
            # Always release the browser, even when goto/parsing raises.
            await browser.close()
        return business


def _business_from_json_ld(entry: dict, url: str) -> dict:
    """Flatten one JSON-LD schema object into the business record dict."""
    agg = entry.get("aggregateRating") or {}
    if not isinstance(agg, dict):
        agg = {}
    # schema.org allows address to be a plain string as well as a
    # PostalAddress object; normalize so .get() below is safe.
    address = entry.get("address") or {}
    if not isinstance(address, dict):
        address = {"streetAddress": address}
    # servesCuisine may be a string, a list of strings, or a list of
    # {"name": ...} objects; iterating a bare string would yield chars.
    cuisine_raw = entry.get("servesCuisine", [])
    if isinstance(cuisine_raw, str):
        cuisine_raw = [cuisine_raw]
    image = entry.get("image")
    return {
        "name": entry.get("name"),
        "type": entry.get("@type"),
        "address": address.get("streetAddress"),
        "city": address.get("addressLocality"),
        "country": address.get("addressCountry"),
        "price_range": entry.get("priceRange"),
        "cuisine": [
            c.get("name") if isinstance(c, dict) else c
            for c in cuisine_raw
        ],
        "rating": agg.get("ratingValue"),
        "review_count": agg.get("reviewCount"),
        "best_rating": agg.get("bestRating"),
        "telephone": entry.get("telephone"),
        "url": url,
        # Original indexed [0] unconditionally on a list — an empty list
        # raised IndexError; guard for emptiness.
        "image": image[0] if isinstance(image, list) and image else image,
    }
This gives you name, address, price range, aggregateRating (value + review count), cuisine type, and contact info — all without parsing unstable HTML.
Scraping Full Reviews with Playwright
Reviews lazy-load and truncate behind "Read more" buttons. You need a real browser for the full text:
async def scrape_reviews(url: str, max_pages: int = 5,
                         proxy: dict | None = None) -> list[dict]:
    """Scrape all reviews from a TripAdvisor listing with pagination.

    Walks the -orNN- pagination URLs (10 reviews per page), expands
    "Read more" truncation, and returns review dicts with keys:
    title, text, rating, date, reviewer, trip_type.

    Args:
        url: Listing URL (must contain "-Reviews-" for pages beyond 1).
        max_pages: Upper bound on pages to visit.
        proxy: Optional Playwright proxy config dict.
    """
    async def _first_text(root, selector: str) -> str:
        # Selectors may match more than one node per card; without .first
        # Playwright's strict mode raises, which the original code turned
        # into silently dropped reviews via the blanket except.
        loc = root.locator(selector).first
        return (await loc.inner_text()) if await loc.count() else ""

    async with async_playwright() as p:
        launch_kwargs = {"headless": True}
        if proxy:
            launch_kwargs["proxy"] = proxy
        browser = await p.chromium.launch(**launch_kwargs)
        context = await browser.new_context(
            user_agent=(
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                "AppleWebKit/537.36 Chrome/122.0.0.0 Safari/537.36"
            ),
            viewport={"width": 1440, "height": 900},
        )
        page = await context.new_page()
        reviews: list[dict] = []
        try:
            for i in range(max_pages):
                # Build page URL with pagination offset (-or10-, -or20-, ...)
                if i == 0:
                    page_url = url
                else:
                    parts = url.split("-Reviews-")
                    if len(parts) != 2:
                        break  # URL doesn't follow the expected pattern
                    page_url = f"{parts[0]}-Reviews-or{i * 10}-{parts[1]}"
                await page.goto(page_url, wait_until="networkidle",
                                timeout=30000)
                # Expand truncated reviews — critical for full text;
                # without the click only ~200 chars per review are present.
                try:
                    read_more = page.locator(
                        '[data-automation="reviewReadMore"], '
                        'button[class*="read-more"]'
                    )
                    for btn in await read_more.all():
                        try:
                            await btn.click(timeout=2000)
                        except Exception:
                            pass  # best-effort: stale/hidden buttons
                    await page.wait_for_timeout(500)
                except Exception:
                    pass
                # Extract review cards
                cards = page.locator(
                    '[data-automation="reviewCard"], '
                    '[class*="reviewCard"], '
                    '[class*="review-container"]'
                )
                page_reviews = []
                for card in await cards.all():
                    try:
                        title = await _first_text(
                            card, '[data-automation="reviewTitle"]')
                        text = await _first_text(
                            card, '[data-automation="reviewText"]')
                        date = await _first_text(
                            card, '.cRVSd, [class*="reviewDate"]')
                        reviewer = await _first_text(
                            card,
                            '[class*="memberName"], '
                            '[data-automation="profileName"]')
                        trip_type = await _first_text(
                            card, '[data-automation="tripType"]')
                        rating_el = card.locator('svg[aria-label]').first
                        aria = (await rating_el.get_attribute("aria-label")
                                if await rating_el.count() else "")
                        # Parse rating from aria label: "5 of 5 bubbles"
                        rating_value = None
                        if aria and "of 5" in aria:
                            try:
                                rating_value = float(aria.split(" of ")[0])
                            except ValueError:
                                pass
                        if text.strip():
                            page_reviews.append({
                                "title": title.strip(),
                                "text": text.strip(),
                                "rating": rating_value,
                                "date": date.strip(),
                                "reviewer": reviewer.strip(),
                                "trip_type": trip_type.strip(),
                            })
                    except Exception:
                        continue  # skip cards that fail to parse
                reviews.extend(page_reviews)
                print(f"  Page {i + 1}: {len(page_reviews)} reviews "
                      f"(total: {len(reviews)})")
                if len(page_reviews) < 8:  # Last page has fewer
                    break
                await asyncio.sleep(2.5)
        finally:
            # Release the browser even when navigation raises mid-loop.
            await browser.close()
        return reviews
TripAdvisor uses data-automation attributes on most interactive elements, which is more reliable than class-based selectors that change with redesigns. The "Read more" click is essential — without it you only get the first ~200 characters of each review.
Extracting Reviewer Metadata
When available, reviewer profiles add useful context — local vs tourist, review history, etc.:
async def scrape_review_with_metadata(card_element) -> dict:
    """Extract extended metadata from a single review card.

    Args:
        card_element: Playwright locator/element for one review card.

    Returns:
        Dict with whichever of title/text/date/reviewer_name/rating/
        reviewer_location/reviewer_contributions/trip_type were found.
    """
    import re  # hoisted from mid-function; stdlib, used for the count parse

    review: dict = {}
    # Basic review content — .first guards against a selector matching
    # more than one node (strict-mode error in Playwright otherwise).
    for selector, key in [
        ('[data-automation="reviewTitle"]', "title"),
        ('[data-automation="reviewText"]', "text"),
        ('.cRVSd', "date"),
        ('[class*="memberName"]', "reviewer_name"),
    ]:
        el = card_element.locator(selector).first
        if await el.count():
            review[key] = (await el.inner_text()).strip()
    # Rating from the bubble SVG aria-label, e.g. "4.0 of 5 bubbles"
    rating_el = card_element.locator('svg[aria-label*="of 5"]').first
    if await rating_el.count():
        aria = await rating_el.get_attribute("aria-label")
        try:
            review["rating"] = float(aria.split(" of ")[0])
        except (ValueError, AttributeError):
            review["rating"] = None  # missing/malformed label
    # Reviewer's location — heuristic: first non-trivial span inside the
    # reviewer info block (assumes location is listed first — TODO confirm)
    locations = []
    for el in await card_element.locator('[class*="reviewerInfo"] span').all():
        text = (await el.inner_text()).strip()
        if text and len(text) > 2:
            locations.append(text)
    review["reviewer_location"] = locations[0] if locations else None
    # Reviewer contribution count, e.g. "123 contributions"
    contrib_el = card_element.locator('[class*="contributions"]').first
    if await contrib_el.count():
        nums = re.findall(r"\d+", await contrib_el.inner_text())
        review["reviewer_contributions"] = int(nums[0]) if nums else None
    # Trip type (solo, family, couple, etc.)
    trip_el = card_element.locator('[data-automation="tripType"]').first
    if await trip_el.count():
        review["trip_type"] = (await trip_el.inner_text()).strip()
    return review
Complete Pipeline: Business Data + Reviews + Storage
import sqlite3
def create_db(db_path: str = "tripadvisor_data.db") -> sqlite3.Connection:
    """Initialize SQLite database for TripAdvisor data.

    Creates the businesses and reviews tables plus supporting indexes
    (idempotent via IF NOT EXISTS) and returns an open connection.

    Args:
        db_path: Path to the SQLite file; ":memory:" works for testing.
    """
    schema_sql = """
        CREATE TABLE IF NOT EXISTS businesses (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT,
            business_type TEXT,
            address TEXT,
            city TEXT,
            country TEXT,
            price_range TEXT,
            cuisine TEXT,
            rating REAL,
            review_count INTEGER,
            telephone TEXT,
            url TEXT UNIQUE,
            scraped_at TEXT DEFAULT (datetime('now'))
        );
        CREATE TABLE IF NOT EXISTS reviews (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            business_url TEXT,
            title TEXT,
            text TEXT,
            rating REAL,
            date TEXT,
            reviewer_name TEXT,
            reviewer_location TEXT,
            reviewer_contributions INTEGER,
            trip_type TEXT,
            scraped_at TEXT DEFAULT (datetime('now')),
            FOREIGN KEY (business_url) REFERENCES businesses(url)
        );
        CREATE INDEX IF NOT EXISTS idx_reviews_url
            ON reviews(business_url);
        CREATE INDEX IF NOT EXISTS idx_reviews_rating
            ON reviews(rating);
        CREATE INDEX IF NOT EXISTS idx_biz_city
            ON businesses(city);
    """
    connection = sqlite3.connect(db_path)
    connection.executescript(schema_sql)
    connection.commit()
    return connection
async def full_pipeline(urls: list[str],
                        db_path: str = "tripadvisor_data.db",
                        proxy_config: dict | None = None,
                        max_review_pages: int = 10) -> None:
    """Scrape business data and reviews for multiple listings.

    For each URL: pulls JSON-LD business metadata, then paginates through
    review pages, persisting both into SQLite (schema from create_db).

    Args:
        urls: TripAdvisor listing URLs.
        db_path: SQLite file to create/append to.
        proxy_config: Optional Playwright proxy dict.
        max_review_pages: Pagination cap per listing (10 reviews/page).
    """
    # `random` was used but never imported anywhere in the file — the
    # original raised NameError at the inter-listing sleep below.
    import random

    conn = create_db(db_path)
    try:
        for url in urls:
            print(f"\nScraping: {url}")
            # Step 1: Extract business metadata from JSON-LD
            biz = await extract_business_json_ld(url, proxy=proxy_config)
            if biz:
                conn.execute("""
                    INSERT OR REPLACE INTO businesses
                    (name, business_type, address, city, country,
                     price_range, cuisine, rating, review_count,
                     telephone, url)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """, (
                    biz.get("name"), biz.get("type"),
                    biz.get("address"), biz.get("city"),
                    biz.get("country"), biz.get("price_range"),
                    json.dumps(biz.get("cuisine", [])),
                    biz.get("rating"), biz.get("review_count"),
                    biz.get("telephone"), url,
                ))
                conn.commit()
                print(f"  {biz.get('name')} — {biz.get('rating')}/5 "
                      f"({biz.get('review_count', 0)} reviews)")
            # Step 2: Scrape full review text with pagination
            reviews = await scrape_reviews(
                url, max_pages=max_review_pages, proxy=proxy_config
            )
            # Batch insert — one executemany instead of per-row execute
            conn.executemany("""
                INSERT INTO reviews
                (business_url, title, text, rating, date,
                 reviewer_name, trip_type)
                VALUES (?, ?, ?, ?, ?, ?, ?)
            """, [
                (url, r.get("title"), r.get("text"), r.get("rating"),
                 r.get("date"), r.get("reviewer"), r.get("trip_type"))
                for r in reviews
            ])
            conn.commit()
            print(f"  Saved {len(reviews)} reviews")
            # Randomized pause between businesses to avoid rate limits
            await asyncio.sleep(random.uniform(5, 10))
    finally:
        # Close the connection even when a scrape step raises mid-run.
        conn.close()
    print("\nPipeline complete.")
Handling TripAdvisor's Anti-Bot Defenses
TripAdvisor aggressively blocks datacenter IPs. After 5-10 requests from the same IP, you'll hit CAPTCHAs or 403s. Residential proxies are non-negotiable.
ThorData provides residential proxies across 195+ countries. Their pool rotates IPs per request, which keeps TripAdvisor from fingerprinting your scraper. City-level targeting helps — if you're scraping Paris restaurants, use French IPs to match the expected traffic pattern.
PROXY_USER = "your_user"
PROXY_PASS = "your_pass"
PROXY_HOST = "proxy.thordata.com"
PROXY_PORT = 9000
def get_proxy_config(country: str = None) -> dict:
"""Build a Playwright proxy config dict."""
url = (
f"http://{PROXY_USER}:{PROXY_PASS}"
f"@{PROXY_HOST}:{PROXY_PORT}"
)
if country:
url += f"?country={country}"
return {
"server": f"http://{PROXY_HOST}:{PROXY_PORT}",
"username": PROXY_USER,
"password": PROXY_PASS,
}
# Usage with full pipeline
proxy = get_proxy_config(country="fr") # French IPs for Paris listings
asyncio.run(full_pipeline(
urls=["https://www.tripadvisor.com/Restaurant_Review-g187147-d1751525-..."],
proxy_config=proxy,
max_review_pages=5,
))
Also rotate user agents and randomize viewport sizes. TripAdvisor checks the full browser fingerprint — not just IP.
Data Analysis Examples
Once you have reviews in SQLite, you can run useful queries:
def analyze_reviews(conn: sqlite3.Connection,
business_url: str = None) -> None:
"""Print summary statistics for collected review data."""
where = ""
params = ()
if business_url:
where = "WHERE r.business_url = ?"
params = (business_url,)
# Rating distribution
print("Rating distribution:")
for row in conn.execute(f"""
SELECT CAST(rating AS INT) as stars, COUNT(*) as count
FROM reviews r {where}
WHERE rating IS NOT NULL
GROUP BY stars
ORDER BY stars DESC
""", params):
bar = "█" * (row[1] // 5)
print(f" {row[0]}★: {row[1]:5} {bar}")
# Common words in negative reviews
print("\nNegative reviews (1-2 stars) sample:")
for row in conn.execute(f"""
SELECT text FROM reviews r {where}
{"AND" if where else "WHERE"} rating <= 2
ORDER BY RANDOM() LIMIT 5
""", params):
print(f" {row[0][:120]}...")
# Trip type breakdown
print("\nTrip type distribution:")
for row in conn.execute(f"""
SELECT trip_type, COUNT(*) as count,
AVG(rating) as avg_rating
FROM reviews r {where}
WHERE trip_type != ''
GROUP BY trip_type
ORDER BY count DESC
""", params):
print(f" {row[0] or 'Unknown':20}: "
f"{row[1]:5} reviews, "
f"{row[2]:.1f}★ avg")
# Monthly review volume (detect trends)
print("\nMonthly review volume:")
for row in conn.execute(f"""
SELECT SUBSTR(date, 1, 7) as month,
COUNT(*) as count,
AVG(rating) as avg_rating
FROM reviews r {where}
WHERE date != ''
GROUP BY month
ORDER BY month DESC
LIMIT 12
""", params):
print(f" {row[0]}: {row[1]:4} reviews, {row[2]:.1f}★ avg")
def find_sentiment_patterns(conn: sqlite3.Connection,
                            negative_keywords: list[str] = None) -> None:
    """Find common complaint patterns in low-rated reviews.

    Counts how many 1-2 star reviews mention each keyword (case-insensitive
    substring match) and prints the non-zero counts.

    Args:
        conn: Open SQLite connection with the reviews table.
        negative_keywords: Keywords to look for; a default complaint list
            is used when omitted.
    """
    keywords = negative_keywords or [
        "slow", "rude", "dirty", "cold", "wait",
        "overpriced", "noisy", "small", "wrong", "missing",
    ]
    count_sql = """
        SELECT COUNT(*) FROM reviews
        WHERE rating <= 2
          AND LOWER(text) LIKE ?
    """
    print("Complaint keyword frequency in 1-2 star reviews:\n")
    for kw in keywords:
        (hits,) = conn.execute(count_sql, (f"%{kw}%",)).fetchone()
        if hits:
            print(f"  '{kw}': {hits} reviews")
Business Use Cases
Hospitality Competitive Analysis
Hotels and restaurants can monitor competitor reviews to identify service gaps. If a competing hotel consistently gets complaints about slow check-in, that's an opportunity to differentiate. Scrape review text and run basic sentiment analysis to spot patterns.
Local SEO and Reputation Monitoring
Agencies managing multiple restaurant or hotel clients can automate review monitoring. Scrape new reviews daily, flag negative ones for immediate response, and track rating trends over time. Faster response to negative reviews improves overall scores.
Travel Content and Recommendations
Travel blogs and apps can aggregate top-rated businesses by city. Scrape ratings and review counts for restaurants in a target city, filter by cuisine type, and build curated "best of" lists backed by real data rather than subjective picks.
Market Entry Research
Before opening a restaurant or hotel in a new city, scrape competitor data: average ratings, review volume, price ranges, common complaints. This gives you a data-driven understanding of the local market before committing capital.
Review Authenticity Analysis
Compare review patterns across businesses to detect fake reviews. Suspicious signals: review clusters on specific dates, reviewers with only one review, identical phrasing across reviews, sudden rating spikes. Build a simple scoring model from scraped data.
Legal Considerations
TripAdvisor's terms prohibit scraping, but the data involved is publicly accessible and largely factual. In hiQ v. LinkedIn (2022), the Ninth Circuit held that scraping publicly accessible data does not violate the CFAA — an influential ruling, though it is circuit-level precedent rather than settled nationwide law, and it does not shield you from contract or other claims. That said:
- Don't log in to access private data
- Don't overload their servers (keep request rates to 10-15 second delays)
- Don't republish review text verbatim as your own content
- Use scraped data for analysis, not wholesale reproduction
- Commercial use cases should get legal review in your jurisdiction
The safest approach: scrape for internal analysis, aggregate statistics, and research. Don't build a competing review site with their data.
See also: Scraping TripAdvisor Attractions | How to Scrape Etsy Listings | Residential vs Datacenter Proxies
NLP Analysis on Review Text
Once you have thousands of reviews in SQLite, natural language processing unlocks deeper insights:
def extract_aspects_from_reviews(conn: sqlite3.Connection,
                                 business_url: str) -> dict:
    """
    Basic aspect-based sentiment analysis on review text.
    Identifies specific service aspects mentioned in reviews.

    Each review that mentions any keyword of an aspect contributes its
    star rating to that aspect's bucket; per-aspect stats are returned
    only for aspects with at least one mention.
    """
    aspect_keywords = {
        "food": ["food", "dish", "meal", "menu", "taste", "flavor",
                 "cuisine", "delicious", "bland", "fresh", "portion"],
        "service": ["service", "staff", "waiter", "server", "friendly",
                    "rude", "attentive", "slow", "fast", "polite"],
        "atmosphere": ["atmosphere", "ambiance", "decor", "interior",
                       "noisy", "quiet", "romantic", "cozy", "busy"],
        "price": ["price", "expensive", "cheap", "worth", "value",
                  "overpriced", "affordable", "cost"],
        "location": ["location", "parking", "street", "area",
                     "accessible", "central", "remote"],
    }
    # One rating bucket per aspect; a review may land in several buckets.
    buckets: dict[str, list] = {name: [] for name in aspect_keywords}
    rows = conn.execute("""
        SELECT text, rating FROM reviews
        WHERE business_url = ? AND rating IS NOT NULL
    """, (business_url,))
    for text, rating in rows:
        lowered = (text or "").lower()
        for name, words in aspect_keywords.items():
            if any(word in lowered for word in words):
                buckets[name].append(rating)
    summary = {}
    for name, ratings in buckets.items():
        if not ratings:
            continue
        summary[name] = {
            "mentions": len(ratings),
            "avg_rating": sum(ratings) / len(ratings),
            "positive": sum(1 for r in ratings if r >= 4),
            "negative": sum(1 for r in ratings if r <= 2),
        }
    return summary
def find_trending_topics(conn: sqlite3.Connection,
                         recent_days: int = 30) -> list[tuple]:
    """Find topics more common in recent reviews vs older ones.

    Splits reviews by scraped_at around a cutoff, counts word frequencies
    in each half, and returns up to 20 (word, recent_count, growth)
    tuples for words that grew 50%+ — sorted by growth, descending.

    Args:
        conn: Open SQLite connection with the reviews table.
        recent_days: Size of the "recent" window in days.
    """
    import re
    from collections import Counter
    from datetime import datetime, timedelta, timezone

    # scraped_at is written by SQLite's datetime('now') in UTC as
    # "YYYY-MM-DD HH:MM:SS". The original compared against
    # datetime.utcnow().isoformat(), whose "T" separator breaks same-day
    # string comparisons (and utcnow() is deprecated); format the cutoff
    # identically so the lexicographic comparison is apples-to-apples.
    cutoff = (datetime.now(timezone.utc)
              - timedelta(days=recent_days)).strftime("%Y-%m-%d %H:%M:%S")
    word_re = re.compile(r"\b[a-z]{4,}\b")  # compiled once, used per row
    recent_words: Counter = Counter()
    older_words: Counter = Counter()
    for text, scraped_at in conn.execute("""
        SELECT text, scraped_at FROM reviews
        WHERE text IS NOT NULL
    """):
        words = word_re.findall((text or "").lower())
        if scraped_at and scraped_at > cutoff:
            recent_words.update(words)
        else:
            older_words.update(words)
    # Generic filler words to ignore ("very" was duplicated originally)
    stopwords = {
        "this", "that", "with", "have", "been", "were",
        "from", "they", "will", "very", "just", "also",
        "good", "great", "nice", "best", "more",
    }
    trending = []
    for word, count in recent_words.most_common(200):
        if word in stopwords or count < 5:
            continue
        older_count = older_words.get(word, 0)
        if older_count > 0:
            growth = (count - older_count) / older_count
            if growth > 0.5:  # 50%+ increase
                trending.append((word, count, growth))
    return sorted(trending, key=lambda x: -x[2])[:20]
Automating Review Collection with a Scheduler
For ongoing review monitoring, schedule the scraper to run daily:
import schedule
import time as time_module
def daily_review_job(urls: list[str],
                     db_path: str = "tripadvisor_data.db") -> None:
    """Run the review collection pipeline as a scheduled job.

    Wraps full_pipeline with a US-geo proxy and a top-level error boundary
    so a failed run logs the error instead of killing the scheduler loop.

    Args:
        urls: Listing URLs to refresh.
        db_path: SQLite file to append collected data to.
    """
    print(f"Starting daily review collection for {len(urls)} businesses...")
    proxy = get_proxy_config(country="us")
    try:
        asyncio.run(full_pipeline(
            urls=urls,
            db_path=db_path,
            proxy_config=proxy,
            max_review_pages=3,  # Only latest reviews
        ))
    except Exception as e:
        print(f"Collection error: {e}")
    else:
        print("Daily collection complete.")
# Define your monitored business URLs (placeholders — replace with real
# listing URLs before running).
MONITORED = [
    "https://www.tripadvisor.com/Restaurant_Review-...",
    "https://www.tripadvisor.com/Hotel_Review-...",
]
# Schedule to run at 3 AM daily. schedule only registers the job here;
# nothing executes until run_pending() is called in the loop below.
schedule.every().day.at("03:00").do(daily_review_job, urls=MONITORED)
if __name__ == "__main__":
    print("Review monitor started. Press Ctrl+C to stop.")
    # Poll once a minute; schedule fires the job when its time arrives.
    while True:
        schedule.run_pending()
        time_module.sleep(60)
This creates a lightweight monitoring system that collects new reviews nightly without manual intervention. Combined with email alerts for negative reviews (rating ≤ 2), it's a functional reputation monitoring product that hospitality businesses will pay for.