Scraping TripAdvisor Attractions and Tours with Python (2026)
TripAdvisor is more than hotel reviews. Their "Things to Do" section covers tourist attractions, guided tours, day trips, and experiences — a category that's grown massively as they compete with GetYourGuide and Viator (which TripAdvisor actually owns). This data is valuable for travel startups, tour operators doing competitive research, or anyone building location-based recommendation systems.
This guide focuses specifically on scraping attractions and tours — not hotels or restaurants, which have different page structures and heavier protections.
Why Attractions Data Is Valuable
The travel industry is projected to generate over $1.5 trillion globally in 2026, with experiences and tours accounting for a fast-growing share. TripAdvisor's attraction data is used in several commercial contexts:
- Travel aggregators — Build destination guides that surface the highest-rated things to do in any city, backed by real review data
- Tour operator research — Identify gaps in the local tour market by finding attraction types with high view counts but few tour offerings
- Local SEO and promotion — Tourism boards use scraped ratings to benchmark local attractions against competing destinations
- Price comparison — Aggregating tour prices across TripAdvisor, Viator, and GetYourGuide reveals pricing patterns and competitive positioning
- Recommendation systems — "You visited Central Park, you might like..." models require comprehensive attraction metadata
- Academic tourism research — Study how review counts and ratings influence visitor behavior and destination popularity
Page Structure
TripAdvisor attraction pages follow a predictable URL pattern:
- City attractions list: /Attractions-g60763-Activities-New_York_City_New_York.html
- Specific attraction: /Attraction_Review-g60763-d104365-Reviews-Central_Park-New_York_City_New_York.html
- Tours/experiences: /AttractionProductReview-g60763-d12345-...
- Category filtered: /Attractions-g60763-Activities-c47-New_York_City_New_York.html (c47 = outdoor activities)
The g code is the geo ID (city), and d code is the attraction ID. These IDs are stable — they don't change when TripAdvisor redesigns their frontend.
Common Category Codes
| Category | Code |
|---|---|
| Outdoor Activities | c47 |
| Tours | c42 |
| Museums | c49 |
| Nightlife | c20 |
| Spas & Wellness | c72 |
| Food & Drink | c36 |
| Shopping | c26 |
Setup
pip install httpx beautifulsoup4 lxml
TripAdvisor renders attraction listings server-side (for SEO), so we can start with plain HTTP requests. We'll escalate to Playwright only where needed.
Scraping Attraction Listings
import httpx
import json
import re
import time
import random
from bs4 import BeautifulSoup
# Browser-like default headers. A consistent, realistic header set matters
# because TripAdvisor fingerprints requests (see the anti-bot section below).
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 Chrome/126.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
}
def create_session(proxy_url: str = None) -> httpx.Client:
    """Build an httpx client primed with TripAdvisor cookies.

    Visiting the homepage first collects the cookies the site sets on
    a fresh visitor, and the short random pause mimics a human reader
    before any listing page is requested.
    """
    session = httpx.Client(
        headers=HEADERS,
        proxy=proxy_url,
        timeout=20,
        follow_redirects=True,
    )
    # Warm-up request: cookies are stored on the client for reuse.
    session.get("https://www.tripadvisor.com/")
    time.sleep(random.uniform(2, 4))
    return session
def _parse_jsonld_attractions(soup) -> list:
    """Extract attraction records from JSON-LD <script> blocks.

    JSON-LD is the most stable extraction target — it is standardized
    for SEO and survives frontend redesigns that churn CSS classes.
    Handles both a top-level list of typed items and a dict carrying
    an ItemList. Tolerates empty script tags (``script.string`` is
    None → TypeError) and explicit JSON nulls for description/rating.
    """
    results = []
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string)
        # TypeError covers json.loads(None) for empty <script> tags,
        # which the original AttributeError catch missed.
        except (json.JSONDecodeError, TypeError, AttributeError):
            continue
        if isinstance(data, list):
            for item in data:
                if item.get("@type") in (
                    "TouristAttraction", "LandmarksOrHistoricalBuildings",
                    "Museum", "Park"
                ):
                    # "aggregateRating" may be present but null
                    agg = item.get("aggregateRating") or {}
                    results.append({
                        "name": item.get("name"),
                        "url": item.get("url"),
                        "rating": agg.get("ratingValue"),
                        "review_count": agg.get("reviewCount"),
                        "address": item.get("address", {}),
                        # "description" may be an explicit null — slice safely
                        "description": (item.get("description") or "")[:300],
                        "image": item.get("image"),
                        "source": "json_ld",
                    })
        elif isinstance(data, dict):
            if (data.get("@type") in ("TouristAttraction", "ItemList")
                    and "itemListElement" in data):
                for el in data["itemListElement"]:
                    item = el.get("item", el)
                    agg = item.get("aggregateRating") or {}
                    results.append({
                        "name": item.get("name"),
                        "url": item.get("url"),
                        "rating": agg.get("ratingValue"),
                        "review_count": agg.get("reviewCount"),
                        "source": "json_ld_list",
                    })
    return results


def _parse_attraction_cards(soup) -> list:
    """DOM fallback: parse listing cards when JSON-LD is sparse.

    Uses partial class-name matches because TripAdvisor's CSS classes
    are hashed and change frequently.
    """
    results = []
    cards = soup.select(
        "[data-automation='cardTitle'], "
        "div[class*='attraction_element'], "
        "[class*='listItem']"
    )
    for card in cards:
        link = card.find("a", href=re.compile(r"Attraction_Review"))
        if not link:
            continue
        name = link.get_text(strip=True)
        href = link.get("href", "")
        if not name or not href:
            continue
        # d-code in the URL is the stable attraction ID
        id_match = re.search(r"-d(\d+)-", href)
        rating = None
        rating_el = card.select_one(
            "[class*='bubble_rating'], "
            "svg[aria-label*='bubble'], "
            "[aria-label*=' of 5']"
        )
        if rating_el:
            m = re.search(r"([\d.]+)\s*of", rating_el.get("aria-label", ""))
            if m:
                rating = float(m.group(1))
        review_count = None
        review_el = card.select_one(
            "[class*='reviewCount'], "
            "[class*='review_count']"
        )
        if review_el:
            nums = re.findall(r"[\d,]+", review_el.get_text(strip=True))
            if nums:
                review_count = int(nums[0].replace(",", ""))
        results.append({
            "name": name,
            "attraction_id": id_match.group(1) if id_match else None,
            # hrefs are usually relative; don't double-prefix absolute ones
            "url": href if href.startswith("http")
            else f"https://www.tripadvisor.com{href}",
            "rating": rating,
            "review_count": review_count,
            "source": "html",
        })
    return results


def scrape_attractions(geo_id: str, city_slug: str,
                       offset: int = 0,
                       category: str = None,
                       session: httpx.Client = None) -> dict:
    """
    Scrape attraction listings for a city.

    geo_id: e.g., 'g60763' for NYC
    city_slug: e.g., 'New_York_City_New_York'
    offset: pagination offset (increments of 30)
    category: optional category code, e.g., 'c47' for outdoor
    session: reuse a warmed-up client; when omitted a throwaway client
        is created and closed before returning (the original leaked it).

    Returns a dict with 'attractions', 'offset', 'has_next',
    'city_slug', and 'geo_id'.
    """
    if category:
        url = (f"https://www.tripadvisor.com/Attractions-{geo_id}"
               f"-Activities-{category}-oa{offset}-{city_slug}.html")
    else:
        url = (f"https://www.tripadvisor.com/Attractions-{geo_id}"
               f"-Activities-oa{offset}-{city_slug}.html")
    owns_client = session is None
    client = session or httpx.Client(headers=HEADERS, timeout=20,
                                     follow_redirects=True)
    try:
        response = client.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "lxml")
    finally:
        if owns_client:
            client.close()
    # JSON-LD first; fall back to DOM cards if it yielded little
    attractions = _parse_jsonld_attractions(soup)
    if len(attractions) < 5:
        attractions.extend(_parse_attraction_cards(soup))
    has_next = bool(soup.select_one(
        "a[class*='next'], a[data-offset], "
        "[data-automation='paginationNextLink']"
    ))
    return {
        "attractions": attractions,
        "offset": offset,
        "has_next": has_next,
        "city_slug": city_slug,
        "geo_id": geo_id,
    }
Scraping Individual Attraction Details
Each attraction page has detailed info — description, hours, pricing, photos, and reviews:
def scrape_attraction_detail(url: str,
                             session: httpx.Client = None) -> dict:
    """Scrape a single attraction's detail page.

    Returns a dict with at least 'url' and 'name', plus whatever the
    page exposes: description, address, telephone, website, lat/lon,
    rating, review_count, categories, hours, price, suggested
    duration, and up to 5 nearby attractions.

    session: reuse a warmed-up client; when omitted a throwaway client
        is created and closed before returning (the original leaked it).
    """
    owns_client = session is None
    client = session or httpx.Client(headers=HEADERS, timeout=20,
                                     follow_redirects=True)
    try:
        response = client.get(url)
        response.raise_for_status()
        page_html = response.text
    finally:
        if owns_client:
            client.close()
    soup = BeautifulSoup(page_html, "lxml")
    detail = {"url": url}
    # Name
    h1 = soup.find("h1")
    detail["name"] = h1.get_text(strip=True) if h1 else ""
    # Try JSON-LD for structured data (most reliable source)
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string)
        # TypeError covers json.loads(None) for empty <script> tags
        except (json.JSONDecodeError, TypeError, AttributeError):
            continue
        if isinstance(data, dict) and data.get("@type") in (
            "TouristAttraction", "Museum", "Park",
            "LandmarksOrHistoricalBuildings"
        ):
            # "description" may be an explicit JSON null — slice safely
            detail["description"] = (data.get("description") or "")[:500]
            detail["address"] = data.get("address", {})
            detail["telephone"] = data.get("telephone")
            detail["website"] = data.get("url")
            geo = data.get("geo") or {}
            if geo:
                detail["latitude"] = geo.get("latitude")
                detail["longitude"] = geo.get("longitude")
            agg = data.get("aggregateRating") or {}
            detail["rating"] = agg.get("ratingValue")
            detail["review_count"] = agg.get("reviewCount")
            break
    # DOM fallback for description
    if not detail.get("description"):
        about = soup.select_one(
            "[class*='about'], [data-automation='about'], "
            "[class*='description']"
        )
        if about:
            detail["description"] = about.get_text(
                strip=True, separator=" "
            )[:500]
    # Categories/tags (deduplicated)
    tag_els = soup.select(
        "[class*='tag'] a, [class*='category'] a, "
        "[data-automation='tag']"
    )
    detail["categories"] = list(set(
        t.get_text(strip=True) for t in tag_els
        if t.get_text(strip=True)
    ))
    # Opening hours
    hours_el = soup.select_one(
        "[class*='hours'], [data-automation='hours'], "
        "[class*='openHours']"
    )
    if hours_el:
        detail["hours"] = hours_el.get_text(
            strip=True, separator=" | "
        )[:300]
    # Price range
    price_el = soup.select_one(
        "[class*='price'], [data-automation='price'], "
        "[class*='admission']"
    )
    if price_el:
        detail["price"] = price_el.get_text(strip=True)[:100]
    # Suggested visit duration
    duration_el = soup.select_one(
        "[class*='duration'], [data-automation='duration']"
    )
    if duration_el:
        detail["suggested_duration"] = duration_el.get_text(strip=True)
    # Nearby attractions (first 5 named links)
    nearby_links = soup.select(
        "[class*='nearby'] a[href*='Attraction_Review'], "
        "[class*='seeNearby'] a"
    )
    detail["nearby"] = [
        {
            "name": n.get_text(strip=True),
            "url": f"https://www.tripadvisor.com{n.get('href', '')}"
            if n.get("href", "").startswith("/") else n.get("href"),
        }
        for n in nearby_links[:5]
        if n.get_text(strip=True)
    ]
    return detail
Scraping Tours and Experiences
Tours are listed separately and have booking data — prices, availability, and duration:
def scrape_tours(geo_id: str, city_slug: str,
                 session: httpx.Client = None) -> list:
    """Scrape tours and experiences listings for a city.

    Fetches the c42 (Tours) category listing page and parses product
    cards for name, url, price, duration, rating, review_count, and
    tour_type. Fields absent from a card are simply omitted from that
    tour's dict.

    session: reuse a warmed-up client; when omitted a throwaway client
        is created and closed before returning (the original leaked it).
    """
    url = (f"https://www.tripadvisor.com/Attractions-{geo_id}"
           f"-Activities-c42-{city_slug}.html")
    owns_client = session is None
    client = session or httpx.Client(headers=HEADERS, timeout=20,
                                     follow_redirects=True)
    try:
        response = client.get(url)
        response.raise_for_status()
        page_html = response.text
    finally:
        if owns_client:
            client.close()
    soup = BeautifulSoup(page_html, "lxml")
    tours = []
    tour_cards = soup.select(
        "[class*='product-card'], [data-automation*='product'], "
        "[class*='experienceCard']"
    )
    for card in tour_cards:
        title_el = card.select_one(
            "a[href*='AttractionProductReview'], "
            "a[href*='Attraction_Review']"
        )
        if not title_el:
            continue
        tour = {
            "name": title_el.get_text(strip=True),
            "url": title_el.get("href", ""),
        }
        if tour["url"].startswith("/"):
            tour["url"] = f"https://www.tripadvisor.com{tour['url']}"
        # Price
        price_el = card.select_one(
            "[class*='price'], [data-automation*='price']"
        )
        if price_el:
            tour["price"] = price_el.get_text(strip=True)
        # Duration
        dur_el = card.select_one(
            "[class*='duration'], [data-automation*='duration']"
        )
        if dur_el:
            tour["duration"] = dur_el.get_text(strip=True)
        # Rating — require at least one digit so float() can never be
        # handed a bare "." (the original [\d.]+ pattern allowed that)
        rating_el = card.select_one(
            "[class*='rating'], [aria-label*='bubble']"
        )
        if rating_el:
            aria = rating_el.get("aria-label", "")
            match = re.search(r"(\d+(?:\.\d+)?)", aria)
            tour["rating"] = float(match.group(1)) if match else None
        # Review count
        review_el = card.select_one("[class*='reviewCount']")
        if review_el:
            nums = re.findall(r"[\d,]+", review_el.get_text())
            tour["review_count"] = int(nums[0].replace(",", "")) if nums else None
        # Category/type label
        type_el = card.select_one(
            "[class*='category'], [class*='tourType']"
        )
        if type_el:
            tour["tour_type"] = type_el.get_text(strip=True)
        tours.append(tour)
    return tours
TripAdvisor Anti-Bot Defenses
TripAdvisor has some of the most aggressive anti-scraping measures in the travel industry. Here's what you're up against:
- Datadome — their primary bot detection. Analyzes browser fingerprints, mouse movement patterns, and request timing. Responses that fail Datadome checks return a 403 with a Datadome challenge page.
- Dynamic class names — CSS classes are hashed and change frequently. Don't rely on exact class names; use partial matches ([class*='something']) and data attributes.
- IP reputation scoring — they maintain blocklists and score IPs based on behavior history.
- Request fingerprinting — header order, TLS fingerprint, and HTTP/2 settings are all checked.
- Session tracking — TripAdvisor correlates request sequences; scrapers that never visit other pages look different from human browsing patterns.
What actually works:
- Residential proxies — Datacenter IPs are blocked almost instantly. For any meaningful TripAdvisor scraping, you need residential IPs from ThorData or similar providers that offer clean, city-targeted residential pools. Since TripAdvisor serves location-specific content, matching your proxy location to the city you're scraping gives more natural-looking traffic.
- Session warm-up — Visit the homepage before scraping. Collect cookies. Maintain the session across requests.
- Slow and steady — 10-20 seconds between requests. TripAdvisor watches for machine-speed browsing.
- Rotate User-Agents across sessions, but keep each session consistent.
def create_session_with_warmup(
    proxy_url: str = None,
    warmup_url: str = "https://www.tripadvisor.com/Tourism-g60763-New_York_City.html",
) -> httpx.Client:
    """Create and warm up a TripAdvisor scraping session.

    proxy_url: optional proxy for the whole session.
    warmup_url: city Tourism page to browse after the homepage so the
        session looks like natural navigation. The original hard-coded
        NYC regardless of the target city; pass the Tourism page of the
        city you are about to scrape for more plausible traffic (the
        default preserves the old behavior).
    """
    client = httpx.Client(
        headers=HEADERS,
        proxy=proxy_url,
        timeout=20,
        follow_redirects=True,
    )
    # Step 1: Hit the homepage to get cookies
    client.get("https://www.tripadvisor.com/")
    time.sleep(random.uniform(2, 4))
    # Step 2: Browse to the target city's overview (natural behavior)
    client.get(warmup_url)
    time.sleep(random.uniform(3, 6))
    return client
Paginating Through All Attractions
TripAdvisor uses offset-based pagination. Each page shows 30 results:
def get_all_attractions(geo_id: str, city_slug: str,
                        max_pages: int = 10,
                        session: httpx.Client = None,
                        category: str = None) -> list:
    """Walk offset-based pagination and collect every attraction listing.

    Each page holds 30 results; stops at max_pages, at the first empty
    page, when the next-page link disappears, or on any request error.
    """
    collected = []
    for page_idx in range(max_pages):
        offset = page_idx * 30
        print(f"Page {page_idx + 1} (offset {offset})...")
        try:
            page = scrape_attractions(
                geo_id, city_slug, offset,
                category=category, session=session
            )
        except httpx.HTTPStatusError as exc:
            print(f" HTTP error {exc.response.status_code} — stopping")
            break
        except Exception as exc:
            print(f" Error: {exc}")
            break
        batch = page["attractions"]
        collected.extend(batch)
        print(f" Got {len(batch)} attractions "
              f"(total: {len(collected)})")
        if not page["has_next"] or not batch:
            print(" No more pages.")
            break
        # Human-speed pacing between page fetches
        time.sleep(random.uniform(10, 18))
    print(f"Total: {len(collected)} attractions")
    return collected
Building a Travel Data Pipeline
Combine attraction listings with detail pages to build a comprehensive dataset:
import sqlite3
def init_db(path: str = "tripadvisor_attractions.db") -> sqlite3.Connection:
    """Open (creating if absent) the SQLite store for scraped data.

    Two tables keyed by URL — attractions and tours — plus indexes on
    geo_id and rating for the common lookup patterns.
    """
    schema = """
    CREATE TABLE IF NOT EXISTS attractions (
        url TEXT PRIMARY KEY,
        attraction_id TEXT,
        name TEXT,
        geo_id TEXT,
        city_slug TEXT,
        rating REAL,
        review_count INTEGER,
        description TEXT,
        categories TEXT,
        hours TEXT,
        price TEXT,
        suggested_duration TEXT,
        latitude REAL,
        longitude REAL,
        telephone TEXT,
        website TEXT,
        scraped_at TEXT DEFAULT (datetime('now'))
    );
    CREATE TABLE IF NOT EXISTS tours (
        url TEXT PRIMARY KEY,
        name TEXT,
        geo_id TEXT,
        city_slug TEXT,
        price TEXT,
        duration TEXT,
        rating REAL,
        review_count INTEGER,
        tour_type TEXT,
        scraped_at TEXT DEFAULT (datetime('now'))
    );
    CREATE INDEX IF NOT EXISTS idx_attractions_geo ON attractions(geo_id);
    CREATE INDEX IF NOT EXISTS idx_attractions_rating ON attractions(rating);
    """
    db = sqlite3.connect(path)
    db.executescript(schema)
    db.commit()
    return db
def build_city_dataset(geo_id: str, city_slug: str,
                       max_pages: int = 5,
                       proxy_url: str = None) -> None:
    """Full pipeline: scrape listings + details for a city.

    Phase 1 paginates the listing pages; phase 2 fetches each detail
    page and upserts the merged record into SQLite. URLs already in
    the database are skipped, so the pipeline is resumable after a
    crash or ban. The connection is closed even if scraping fails.
    """
    conn = init_db()
    session = create_session_with_warmup(proxy_url)
    try:
        print(f"\n=== Scraping {city_slug} ===\n")
        # Phase 1: Collect listing URLs
        listings = get_all_attractions(
            geo_id, city_slug,
            max_pages=max_pages, session=session
        )
        # Phase 2: Fetch detail pages
        for i, item in enumerate(listings, 1):
            url = item.get("url")
            if not url or not url.startswith("http"):
                continue
            # Skip rows we already have (resumability)
            if conn.execute("SELECT url FROM attractions WHERE url=?",
                            (url,)).fetchone():
                continue
            print(f"[{i}/{len(listings)}] {item.get('name', url[:40])}")
            try:
                detail = scrape_attraction_detail(url, session=session)
                # Merge listing fields, but never let a missing (None)
                # listing value clobber a richer detail-page value —
                # plain dict.update() did exactly that.
                for key, value in item.items():
                    if value is not None:
                        detail[key] = value
                detail["geo_id"] = geo_id
                detail["city_slug"] = city_slug
                # Persist telephone/website too — the schema has those
                # columns but the original INSERT silently dropped them.
                conn.execute("""
                    INSERT OR REPLACE INTO attractions
                    (url, attraction_id, name, geo_id, city_slug,
                     rating, review_count, description, categories,
                     hours, price, suggested_duration,
                     latitude, longitude, telephone, website)
                    VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
                """, (
                    url, detail.get("attraction_id"),
                    detail.get("name"), geo_id, city_slug,
                    detail.get("rating"), detail.get("review_count"),
                    (detail.get("description") or "")[:500],
                    json.dumps(detail.get("categories", [])),
                    detail.get("hours"), detail.get("price"),
                    detail.get("suggested_duration"),
                    detail.get("latitude"), detail.get("longitude"),
                    detail.get("telephone"), detail.get("website"),
                ))
                conn.commit()
                print(f" Saved: {detail.get('rating')}/5, "
                      f"{detail.get('review_count', 0)} reviews")
            except Exception as e:
                print(f" Error: {e}")
            # Slow, human-like pacing between detail fetches
            time.sleep(random.uniform(12, 20))
    finally:
        conn.close()
    print(f"\nDataset saved to tripadvisor_attractions.db")
# Example — guarded so importing this module doesn't kick off a scrape
if __name__ == "__main__":
    build_city_dataset(
        geo_id="g60763",
        city_slug="New_York_City_New_York",
        max_pages=3,
    )
Conclusion
TripAdvisor's attractions data is valuable but well-guarded. The JSON-LD structured data in listing pages is your most reliable extraction target — it's standardized and changes less often than CSS classes. For detail pages and tours, HTML parsing with flexible selectors is necessary. Keep request rates slow (10-20 seconds between pages), use residential proxies from ThorData, and maintain proper session cookies. Their Datadome protection is among the toughest in the industry, so expect to invest in proxy infrastructure for any production-scale scraping. The JSON-LD strategy combined with slow residential browsing is the combination that keeps scraping sessions alive for multi-hour runs rather than dying after a few minutes.
Analyzing Attraction Data
Once you have attractions data in SQLite, useful analysis queries become straightforward:
def analyze_city_attractions(conn: sqlite3.Connection,
                             city_slug: str) -> None:
    """Print summary statistics for a city's attractions.

    Covers totals, the ten most-reviewed attractions, a rating-tier
    distribution, and a category frequency breakdown. Robust to an
    empty or partially-scraped city: SQLite aggregates over zero rows
    return NULL, which is coalesced to 0 before numeric formatting
    (the original crashed with a TypeError on ':.2f' / ':,').
    """
    print(f"=== {city_slug} Attraction Analysis ===\n")
    # Summary stats — COUNT(*) always yields one row, but AVG/SUM/MAX
    # are NULL when nothing matches, so guard every numeric format.
    row = conn.execute("""
        SELECT COUNT(*) as total,
               AVG(rating) as avg_rating,
               SUM(review_count) as total_reviews,
               MAX(review_count) as max_reviews
        FROM attractions
        WHERE city_slug = ?
    """, (city_slug,)).fetchone()
    if row:
        print(f"Total attractions: {row[0]}")
        print(f"Average rating: {(row[1] or 0):.2f}/5")
        print(f"Total reviews: {(row[2] or 0):,}")
        print(f"Most-reviewed: {(row[3] or 0):,} reviews")
    # Top attractions by review count — name/review_count may be NULL
    print("\nTop 10 by review count:")
    for row in conn.execute("""
        SELECT name, rating, review_count, categories
        FROM attractions
        WHERE city_slug = ?
        ORDER BY review_count DESC LIMIT 10
    """, (city_slug,)):
        print(f" {(row[0] or '?')[:40]:40} {row[1] or '?'}★ "
              f"{(row[2] or 0):>8,} reviews")
    # Rating distribution
    print("\nRating distribution:")
    for row in conn.execute("""
        SELECT
            CASE
                WHEN rating >= 4.5 THEN 'Excellent (4.5-5)'
                WHEN rating >= 4.0 THEN 'Very Good (4-4.5)'
                WHEN rating >= 3.5 THEN 'Good (3.5-4)'
                WHEN rating >= 3.0 THEN 'Average (3-3.5)'
                ELSE 'Below Average (<3)'
            END as tier,
            COUNT(*) as count
        FROM attractions
        WHERE city_slug = ? AND rating IS NOT NULL
        GROUP BY tier
        ORDER BY MIN(rating) DESC
    """, (city_slug,)):
        print(f" {row[0]:25}: {row[1]:4} attractions")
    # Category breakdown — categories are stored as a JSON array string
    print("\nTop categories:")
    import json
    all_cats = conn.execute(
        "SELECT categories FROM attractions WHERE city_slug = ?",
        (city_slug,)
    ).fetchall()
    cat_counts = {}
    for row in all_cats:
        try:
            cats = json.loads(row[0] or "[]")
        except json.JSONDecodeError:
            continue
        for cat in cats:
            if cat:
                cat_counts[cat] = cat_counts.get(cat, 0) + 1
    for cat, count in sorted(
        cat_counts.items(), key=lambda x: -x[1]
    )[:10]:
        print(f" {cat:30}: {count}")
def compare_cities(conn: sqlite3.Connection,
                   city_slugs: list) -> None:
    """Print a side-by-side table of attraction metrics per city.

    One row per slug: attraction count, average rating, and total
    review volume; NULL aggregates (no data) render as 0.
    """
    print("City comparison:\n")
    print(f"{'City':30} {'Attractions':>12} {'Avg Rating':>12} "
          f"{'Total Reviews':>15}")
    print("-" * 72)
    metrics_sql = """
        SELECT COUNT(*), AVG(rating), SUM(review_count)
        FROM attractions WHERE city_slug = ?
    """
    for slug in city_slugs:
        count, avg_rating, total_reviews = conn.execute(
            metrics_sql, (slug,)
        ).fetchone()
        label = slug.replace("_", " ").split("-")[0]
        print(f"{label:30} {count:>12,} "
              f"{(avg_rating or 0):>12.2f} {(total_reviews or 0):>15,}")
Geocoding and Map Integration
When latitude/longitude data is available (extracted from JSON-LD), you can build map visualizations:
def export_geojson(conn: sqlite3.Connection,
                   city_slug: str,
                   min_reviews: int = 100,
                   output_file: str = None) -> dict:
    """Export attraction data as GeoJSON for mapping.

    Includes only rows that have coordinates and at least min_reviews
    reviews, ordered by review count. Writes the FeatureCollection to
    output_file when given, and returns it either way.
    """
    query = """
        SELECT name, rating, review_count, categories,
               latitude, longitude, url
        FROM attractions
        WHERE city_slug = ?
          AND latitude IS NOT NULL
          AND longitude IS NOT NULL
          AND review_count >= ?
        ORDER BY review_count DESC
    """
    features = [
        {
            "type": "Feature",
            "geometry": {
                "type": "Point",
                # GeoJSON coordinate order is [longitude, latitude]
                "coordinates": [lon, lat],
            },
            "properties": {
                "name": name,
                "rating": rating,
                "review_count": reviews,
                "categories": categories,
                "url": url,
            },
        }
        for name, rating, reviews, categories, lat, lon, url
        in conn.execute(query, (city_slug, min_reviews))
    ]
    geojson = {
        "type": "FeatureCollection",
        "features": features,
    }
    if output_file:
        with open(output_file, "w") as f:
            json.dump(geojson, f, indent=2)
        print(f"Exported {len(features)} attractions to {output_file}")
    return geojson
# Export NYC attractions with geo coordinates
# NOTE(review): `conn` must be an already-open sqlite3 connection (e.g. from
# init_db()) with scraped data loaded — it is not defined in this snippet.
nyc_geo = export_geojson(
    conn, "New_York_City_New_York",
    min_reviews=500,
    output_file="nyc_attractions.geojson"
)
This GeoJSON output is directly usable in Leaflet.js, Mapbox, or Google Maps to create interactive attraction maps — a common feature in travel apps and local discovery platforms.