Scraping TripAdvisor Reviews and Business Data (2026)
TripAdvisor is one of the richest sources of business reviews on the web — restaurants, hotels, attractions, all with detailed ratings, review text, and metadata. Over 1 billion reviews across 8 million businesses. Here's how to scrape it programmatically in 2026.
Understanding TripAdvisor URLs
Every listing follows a predictable pattern:
https://www.tripadvisor.com/Restaurant_Review-g187147-d1751525-Reviews-Le_Cinq-Paris.html
The key parts: g187147 is the geo ID (Paris), d1751525 is the business ID. For pagination, TripAdvisor appends -or10-, -or20-, etc. before the business name — each page shows 10 reviews.
/Restaurant_Review-g187147-d1751525-Reviews-or10-Le_Cinq-Paris.html # page 2
/Restaurant_Review-g187147-d1751525-Reviews-or20-Le_Cinq-Paris.html # page 3
Hotels use Hotel_Review, attractions use Attraction_Review. Same structure.
You can also find geo IDs programmatically by scraping TripAdvisor's search TypeAhead endpoint:
import httpx
from urllib.parse import quote
def find_geo_id(city: str) -> dict | None:
    """Find TripAdvisor geo ID and metadata for a city.

    Queries TripAdvisor's TypeAhead search endpoint and returns the top
    match, or None when no result is found.

    Args:
        city: Free-text city query, e.g. "Paris, France".

    Returns:
        Dict with "value" (geo ID such as "g187147"), "name", and "type",
        or None when the search yields nothing.

    Raises:
        httpx.HTTPStatusError: on a non-2xx response (e.g. a block page).
    """
    url = "https://www.tripadvisor.com/TypeAheadJson"
    params = {
        "action": "API",
        "types": "geo",
        "query": city,
        "max": "5",
        "lang": "en_US",
    }
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36"
        )
    }
    resp = httpx.get(url, params=params, headers=headers, timeout=15)
    # A blocked or rate-limited request returns a non-200 HTML page;
    # fail loudly here instead of raising a confusing JSONDecodeError.
    resp.raise_for_status()
    results = resp.json().get("results", [])
    if not results:
        return None
    best = results[0]
    return {
        "value": best.get("value"),  # e.g., "g187147"
        "name": best.get("display_name"),
        "type": best.get("type"),
    }

# Example — guard against None: unknown cities return no geo ID, and the
# original unconditional print would raise TypeError in that case.
paris = find_geo_id("Paris, France")
if paris:
    print(f"Paris geo ID: {paris['value']}")
JSON-LD: The Easy Win
Before touching the DOM, check the page source for JSON-LD structured data. TripAdvisor embeds a Restaurant or Hotel schema object with aggregate ratings that is far more stable than CSS class selectors:
from playwright.async_api import async_playwright
import json
import asyncio
async def extract_business_json_ld(url: str, proxy: dict | None = None) -> dict:
    """Extract structured business data from TripAdvisor JSON-LD.

    Loads the listing page in headless Chromium, scans every
    <script type="application/ld+json"> block, and returns a flat dict
    for the first Restaurant/Hotel/attraction schema object found.
    Returns an empty dict when no matching object is present.

    Args:
        url: Full TripAdvisor listing URL.
        proxy: Optional Playwright proxy config (server/username/password).
    """
    target_types = {
        "Restaurant", "Hotel", "TouristAttraction",
        "LodgingBusiness", "FoodEstablishment",
    }
    async with async_playwright() as p:
        launch_kwargs = {"headless": True}
        if proxy:
            launch_kwargs["proxy"] = proxy
        browser = await p.chromium.launch(**launch_kwargs)
        context = await browser.new_context(
            user_agent=(
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                "AppleWebKit/537.36 Chrome/122.0.0.0 Safari/537.36"
            ),
            viewport={"width": 1440, "height": 900},
            locale="en-US",
        )
        page = await context.new_page()
        business: dict = {}
        try:
            await page.goto(url, wait_until="domcontentloaded", timeout=30000)
            scripts = await page.query_selector_all(
                'script[type="application/ld+json"]'
            )
            for script in scripts:
                try:
                    data = json.loads(await script.inner_text())
                except (json.JSONDecodeError, AttributeError):
                    continue
                # A JSON-LD block may hold one object or a list of them.
                candidates = data if isinstance(data, list) else [data]
                for entry in candidates:
                    if not isinstance(entry, dict):
                        continue
                    # "@type" can be a string OR a list of type names;
                    # the original equality check never matched the list form.
                    raw_type = entry.get("@type")
                    types = raw_type if isinstance(raw_type, list) else [raw_type]
                    if target_types.intersection(types):
                        business = _business_from_json_ld(entry, url)
                        break
                if business:
                    break
        finally:
            # Always release the browser, even when goto/parsing raises.
            await browser.close()
        return business


def _business_from_json_ld(entry: dict, url: str) -> dict:
    """Flatten one JSON-LD schema object into the business record dict."""
    agg = entry.get("aggregateRating") or {}
    if not isinstance(agg, dict):
        agg = {}
    # schema.org allows address to be a plain string as well as a
    # PostalAddress object; normalize so .get() below is safe.
    address = entry.get("address") or {}
    if not isinstance(address, dict):
        address = {"streetAddress": address}
    # servesCuisine may be a string, a list of strings, or a list of
    # {"name": ...} objects; iterating a bare string would yield chars.
    cuisine_raw = entry.get("servesCuisine", [])
    if isinstance(cuisine_raw, str):
        cuisine_raw = [cuisine_raw]
    image = entry.get("image")
    return {
        "name": entry.get("name"),
        "type": entry.get("@type"),
        "address": address.get("streetAddress"),
        "city": address.get("addressLocality"),
        "country": address.get("addressCountry"),
        "price_range": entry.get("priceRange"),
        "cuisine": [
            c.get("name") if isinstance(c, dict) else c
            for c in cuisine_raw
        ],
        "rating": agg.get("ratingValue"),
        "review_count": agg.get("reviewCount"),
        "best_rating": agg.get("bestRating"),
        "telephone": entry.get("telephone"),
        "url": url,
        # Original indexed [0] unconditionally on a list — an empty list
        # raised IndexError; guard for emptiness.
        "image": image[0] if isinstance(image, list) and image else image,
    }
This gives you name, address, price range, aggregateRating (value + review count), cuisine type, and contact info — all without parsing unstable HTML.
Scraping Full Reviews with Playwright
Reviews lazy-load and truncate behind "Read more" buttons. You need a real browser for the full text:
async def scrape_reviews(url: str, max_pages: int = 5,
                         proxy: dict | None = None) -> list[dict]:
    """Scrape all reviews from a TripAdvisor listing with pagination.

    Walks the -orNN- pagination URLs (10 reviews per page), expands
    "Read more" truncation, and returns review dicts with keys:
    title, text, rating, date, reviewer, trip_type.

    Args:
        url: Listing URL (must contain "-Reviews-" for pages beyond 1).
        max_pages: Upper bound on pages to visit.
        proxy: Optional Playwright proxy config dict.
    """
    async def _first_text(root, selector: str) -> str:
        # Selectors may match more than one node per card; without .first
        # Playwright's strict mode raises, which the original code turned
        # into silently dropped reviews via the blanket except.
        loc = root.locator(selector).first
        return (await loc.inner_text()) if await loc.count() else ""

    async with async_playwright() as p:
        launch_kwargs = {"headless": True}
        if proxy:
            launch_kwargs["proxy"] = proxy
        browser = await p.chromium.launch(**launch_kwargs)
        context = await browser.new_context(
            user_agent=(
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                "AppleWebKit/537.36 Chrome/122.0.0.0 Safari/537.36"
            ),
            viewport={"width": 1440, "height": 900},
        )
        page = await context.new_page()
        reviews: list[dict] = []
        try:
            for i in range(max_pages):
                # Build page URL with pagination offset (-or10-, -or20-, ...)
                if i == 0:
                    page_url = url
                else:
                    parts = url.split("-Reviews-")
                    if len(parts) != 2:
                        break  # URL doesn't follow the expected pattern
                    page_url = f"{parts[0]}-Reviews-or{i * 10}-{parts[1]}"
                await page.goto(page_url, wait_until="networkidle",
                                timeout=30000)
                # Expand truncated reviews — critical for full text;
                # without the click only ~200 chars per review are present.
                try:
                    read_more = page.locator(
                        '[data-automation="reviewReadMore"], '
                        'button[class*="read-more"]'
                    )
                    for btn in await read_more.all():
                        try:
                            await btn.click(timeout=2000)
                        except Exception:
                            pass  # best-effort: stale/hidden buttons
                    await page.wait_for_timeout(500)
                except Exception:
                    pass
                # Extract review cards
                cards = page.locator(
                    '[data-automation="reviewCard"], '
                    '[class*="reviewCard"], '
                    '[class*="review-container"]'
                )
                page_reviews = []
                for card in await cards.all():
                    try:
                        title = await _first_text(
                            card, '[data-automation="reviewTitle"]')
                        text = await _first_text(
                            card, '[data-automation="reviewText"]')
                        date = await _first_text(
                            card, '.cRVSd, [class*="reviewDate"]')
                        reviewer = await _first_text(
                            card,
                            '[class*="memberName"], '
                            '[data-automation="profileName"]')
                        trip_type = await _first_text(
                            card, '[data-automation="tripType"]')
                        rating_el = card.locator('svg[aria-label]').first
                        aria = (await rating_el.get_attribute("aria-label")
                                if await rating_el.count() else "")
                        # Parse rating from aria label: "5 of 5 bubbles"
                        rating_value = None
                        if aria and "of 5" in aria:
                            try:
                                rating_value = float(aria.split(" of ")[0])
                            except ValueError:
                                pass
                        if text.strip():
                            page_reviews.append({
                                "title": title.strip(),
                                "text": text.strip(),
                                "rating": rating_value,
                                "date": date.strip(),
                                "reviewer": reviewer.strip(),
                                "trip_type": trip_type.strip(),
                            })
                    except Exception:
                        continue  # skip cards that fail to parse
                reviews.extend(page_reviews)
                print(f"  Page {i + 1}: {len(page_reviews)} reviews "
                      f"(total: {len(reviews)})")
                if len(page_reviews) < 8:  # Last page has fewer
                    break
                await asyncio.sleep(2.5)
        finally:
            # Release the browser even when navigation raises mid-loop.
            await browser.close()
        return reviews
TripAdvisor uses data-automation attributes on most interactive elements, which is more reliable than class-based selectors that change with redesigns. The "Read more" click is essential — without it you only get the first ~200 characters of each review.
Extracting Reviewer Metadata
When available, reviewer profiles add useful context — local vs tourist, review history, etc.:
async def scrape_review_with_metadata(card_element) -> dict:
    """Extract extended metadata from a single review card.

    Args:
        card_element: Playwright locator/element for one review card.

    Returns:
        Dict with whichever of title/text/date/reviewer_name/rating/
        reviewer_location/reviewer_contributions/trip_type were found.
    """
    import re  # hoisted from mid-function; stdlib, used for the count parse

    review: dict = {}
    # Basic review content — .first guards against a selector matching
    # more than one node (strict-mode error in Playwright otherwise).
    for selector, key in [
        ('[data-automation="reviewTitle"]', "title"),
        ('[data-automation="reviewText"]', "text"),
        ('.cRVSd', "date"),
        ('[class*="memberName"]', "reviewer_name"),
    ]:
        el = card_element.locator(selector).first
        if await el.count():
            review[key] = (await el.inner_text()).strip()
    # Rating from the bubble SVG aria-label, e.g. "4.0 of 5 bubbles"
    rating_el = card_element.locator('svg[aria-label*="of 5"]').first
    if await rating_el.count():
        aria = await rating_el.get_attribute("aria-label")
        try:
            review["rating"] = float(aria.split(" of ")[0])
        except (ValueError, AttributeError):
            review["rating"] = None  # missing/malformed label
    # Reviewer's location — heuristic: first non-trivial span inside the
    # reviewer info block (assumes location is listed first — TODO confirm)
    locations = []
    for el in await card_element.locator('[class*="reviewerInfo"] span').all():
        text = (await el.inner_text()).strip()
        if text and len(text) > 2:
            locations.append(text)
    review["reviewer_location"] = locations[0] if locations else None
    # Reviewer contribution count, e.g. "123 contributions"
    contrib_el = card_element.locator('[class*="contributions"]').first
    if await contrib_el.count():
        nums = re.findall(r"\d+", await contrib_el.inner_text())
        review["reviewer_contributions"] = int(nums[0]) if nums else None
    # Trip type (solo, family, couple, etc.)
    trip_el = card_element.locator('[data-automation="tripType"]').first
    if await trip_el.count():
        review["trip_type"] = (await trip_el.inner_text()).strip()
    return review
Complete Pipeline: Business Data + Reviews + Storage
import sqlite3
def create_db(db_path: str = "tripadvisor_data.db") -> sqlite3.Connection:
    """Initialize SQLite database for TripAdvisor data.

    Creates the businesses and reviews tables plus supporting indexes
    (idempotent via IF NOT EXISTS) and returns an open connection.

    Args:
        db_path: Path to the SQLite file; ":memory:" works for testing.
    """
    schema_sql = """
        CREATE TABLE IF NOT EXISTS businesses (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT,
            business_type TEXT,
            address TEXT,
            city TEXT,
            country TEXT,
            price_range TEXT,
            cuisine TEXT,
            rating REAL,
            review_count INTEGER,
            telephone TEXT,
            url TEXT UNIQUE,
            scraped_at TEXT DEFAULT (datetime('now'))
        );
        CREATE TABLE IF NOT EXISTS reviews (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            business_url TEXT,
            title TEXT,
            text TEXT,
            rating REAL,
            date TEXT,
            reviewer_name TEXT,
            reviewer_location TEXT,
            reviewer_contributions INTEGER,
            trip_type TEXT,
            scraped_at TEXT DEFAULT (datetime('now')),
            FOREIGN KEY (business_url) REFERENCES businesses(url)
        );
        CREATE INDEX IF NOT EXISTS idx_reviews_url
            ON reviews(business_url);
        CREATE INDEX IF NOT EXISTS idx_reviews_rating
            ON reviews(rating);
        CREATE INDEX IF NOT EXISTS idx_biz_city
            ON businesses(city);
    """
    connection = sqlite3.connect(db_path)
    connection.executescript(schema_sql)
    connection.commit()
    return connection
async def full_pipeline(urls: list[str],
                        db_path: str = "tripadvisor_data.db",
                        proxy_config: dict | None = None,
                        max_review_pages: int = 10) -> None:
    """Scrape business data and reviews for multiple listings.

    For each URL: pulls JSON-LD business metadata, then paginates through
    review pages, persisting both into SQLite (schema from create_db).

    Args:
        urls: TripAdvisor listing URLs.
        db_path: SQLite file to create/append to.
        proxy_config: Optional Playwright proxy dict.
        max_review_pages: Pagination cap per listing (10 reviews/page).
    """
    # `random` was used but never imported anywhere in the file — the
    # original raised NameError at the inter-listing sleep below.
    import random

    conn = create_db(db_path)
    try:
        for url in urls:
            print(f"\nScraping: {url}")
            # Step 1: Extract business metadata from JSON-LD
            biz = await extract_business_json_ld(url, proxy=proxy_config)
            if biz:
                conn.execute("""
                    INSERT OR REPLACE INTO businesses
                    (name, business_type, address, city, country,
                     price_range, cuisine, rating, review_count,
                     telephone, url)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """, (
                    biz.get("name"), biz.get("type"),
                    biz.get("address"), biz.get("city"),
                    biz.get("country"), biz.get("price_range"),
                    json.dumps(biz.get("cuisine", [])),
                    biz.get("rating"), biz.get("review_count"),
                    biz.get("telephone"), url,
                ))
                conn.commit()
                print(f"  {biz.get('name')} — {biz.get('rating')}/5 "
                      f"({biz.get('review_count', 0)} reviews)")
            # Step 2: Scrape full review text with pagination
            reviews = await scrape_reviews(
                url, max_pages=max_review_pages, proxy=proxy_config
            )
            # Batch insert — one executemany instead of per-row execute
            conn.executemany("""
                INSERT INTO reviews
                (business_url, title, text, rating, date,
                 reviewer_name, trip_type)
                VALUES (?, ?, ?, ?, ?, ?, ?)
            """, [
                (url, r.get("title"), r.get("text"), r.get("rating"),
                 r.get("date"), r.get("reviewer"), r.get("trip_type"))
                for r in reviews
            ])
            conn.commit()
            print(f"  Saved {len(reviews)} reviews")
            # Randomized pause between businesses to avoid rate limits
            await asyncio.sleep(random.uniform(5, 10))
    finally:
        # Close the connection even when a scrape step raises mid-run.
        conn.close()
    print("\nPipeline complete.")
Handling TripAdvisor's Anti-Bot Defenses
TripAdvisor aggressively blocks datacenter IPs. After 5-10 requests from the same IP, you'll hit CAPTCHAs or 403s. Residential proxies are non-negotiable.
ThorData provides residential proxies across 195+ countries. Their pool rotates IPs per request, which keeps TripAdvisor from fingerprinting your scraper. City-level targeting helps — if you're scraping Paris restaurants, use French IPs to match the expected traffic pattern.
PROXY_USER = "your_user"
PROXY_PASS = "your_pass"
PROXY_HOST = "proxy.thordata.com"
PROXY_PORT = 9000
def get_proxy_config(country: str = None) -> dict:
"""Build a Playwright proxy config dict."""
url = (
f"http://{PROXY_USER}:{PROXY_PASS}"
f"@{PROXY_HOST}:{PROXY_PORT}"
)
if country:
url += f"?country={country}"
return {
"server": f"http://{PROXY_HOST}:{PROXY_PORT}",
"username": PROXY_USER,
"password": PROXY_PASS,
}
# Usage with full pipeline
proxy = get_proxy_config(country="fr") # French IPs for Paris listings
asyncio.run(full_pipeline(
urls=["https://www.tripadvisor.com/Restaurant_Review-g187147-d1751525-..."],
proxy_config=proxy,
max_review_pages=5,
))
Also rotate user agents and randomize viewport sizes. TripAdvisor checks the full browser fingerprint — not just IP.
Data Analysis Examples
Once you have reviews in SQLite, you can run useful queries:
def analyze_reviews(conn: sqlite3.Connection,
business_url: str = None) -> None:
"""Print summary statistics for collected review data."""
where = ""
params = ()
if business_url:
where = "WHERE r.business_url = ?"
params = (business_url,)
# Rating distribution
print("Rating distribution:")
for row in conn.execute(f"""
SELECT CAST(rating AS INT) as stars, COUNT(*) as count
FROM reviews r {where}
WHERE rating IS NOT NULL
GROUP BY stars
ORDER BY stars DESC
""", params):
bar = "█" * (row[1] // 5)
print(f" {row[0]}★: {row[1]:5} {bar}")
# Common words in negative reviews
print("\nNegative reviews (1-2 stars) sample:")
for row in conn.execute(f"""
SELECT text FROM reviews r {where}
{"AND" if where else "WHERE"} rating <= 2
ORDER BY RANDOM() LIMIT 5
""", params):
print(f" {row[0][:120]}...")
# Trip type breakdown
print("\nTrip type distribution:")
for row in conn.execute(f"""
SELECT trip_type, COUNT(*) as count,
AVG(rating) as avg_rating
FROM reviews r {where}
WHERE trip_type != ''
GROUP BY trip_type
ORDER BY count DESC
""", params):
print(f" {row[0] or 'Unknown':20}: "
f"{row[1]:5} reviews, "
f"{row[2]:.1f}★ avg")
# Monthly review volume (detect trends)
print("\nMonthly review volume:")
for row in conn.execute(f"""
SELECT SUBSTR(date, 1, 7) as month,
COUNT(*) as count,
AVG(rating) as avg_rating
FROM reviews r {where}
WHERE date != ''
GROUP BY month
ORDER BY month DESC
LIMIT 12
""", params):
print(f" {row[0]}: {row[1]:4} reviews, {row[2]:.1f}★ avg")
def find_sentiment_patterns(conn: sqlite3.Connection,
                            negative_keywords: list[str] = None) -> None:
    """Find common complaint patterns in low-rated reviews.

    Counts how many 1-2 star reviews mention each keyword (case-insensitive
    substring match) and prints the non-zero counts.

    Args:
        conn: Open SQLite connection with the reviews table.
        negative_keywords: Keywords to look for; a default complaint list
            is used when omitted.
    """
    keywords = negative_keywords or [
        "slow", "rude", "dirty", "cold", "wait",
        "overpriced", "noisy", "small", "wrong", "missing",
    ]
    count_sql = """
        SELECT COUNT(*) FROM reviews
        WHERE rating <= 2
          AND LOWER(text) LIKE ?
    """
    print("Complaint keyword frequency in 1-2 star reviews:\n")
    for kw in keywords:
        (hits,) = conn.execute(count_sql, (f"%{kw}%",)).fetchone()
        if hits:
            print(f"  '{kw}': {hits} reviews")
Business Use Cases
Hospitality Competitive Analysis
Hotels and restaurants can monitor competitor reviews to identify service gaps. If a competing hotel consistently gets complaints about slow check-in, that's an opportunity to differentiate. Scrape review text and run basic sentiment analysis to spot patterns.
Local SEO and Reputation Monitoring
Agencies managing multiple restaurant or hotel clients can automate review monitoring. Scrape new reviews daily, flag negative ones for immediate response, and track rating trends over time. Faster response to negative reviews improves overall scores.
Travel Content and Recommendations
Travel blogs and apps can aggregate top-rated businesses by city. Scrape ratings and review counts for restaurants in a target city, filter by cuisine type, and build curated "best of" lists backed by real data rather than subjective picks.
Market Entry Research
Before opening a restaurant or hotel in a new city, scrape competitor data: average ratings, review volume, price ranges, common complaints. This gives you a data-driven understanding of the local market before committing capital.
Review Authenticity Analysis
Compare review patterns across businesses to detect fake reviews. Suspicious signals: review clusters on specific dates, reviewers with only one review, identical phrasing across reviews, sudden rating spikes. Build a simple scoring model from scraped data.
Legal Considerations
TripAdvisor's terms prohibit scraping, but the data involved is publicly accessible and largely factual. In hiQ v. LinkedIn (2022), the Ninth Circuit held that scraping publicly accessible data does not violate the CFAA — an influential ruling, though it is circuit-level precedent rather than settled nationwide law, and it does not shield you from contract or other claims. That said:
- Don't log in to access private data
- Don't overload their servers (keep request rates to 10-15 second delays)
- Don't republish review text verbatim as your own content
- Use scraped data for analysis, not wholesale reproduction
- Commercial use cases should get legal review in your jurisdiction
The safest approach: scrape for internal analysis, aggregate statistics, and research. Don't build a competing review site with their data.
See also: Scraping TripAdvisor Attractions | How to Scrape Etsy Listings | Residential vs Datacenter Proxies
NLP Analysis on Review Text
Once you have thousands of reviews in SQLite, natural language processing unlocks deeper insights:
def extract_aspects_from_reviews(conn: sqlite3.Connection,
                                 business_url: str) -> dict:
    """
    Basic aspect-based sentiment analysis on review text.
    Identifies specific service aspects mentioned in reviews.

    Each review that mentions any keyword of an aspect contributes its
    star rating to that aspect's bucket; per-aspect stats are returned
    only for aspects with at least one mention.
    """
    aspect_keywords = {
        "food": ["food", "dish", "meal", "menu", "taste", "flavor",
                 "cuisine", "delicious", "bland", "fresh", "portion"],
        "service": ["service", "staff", "waiter", "server", "friendly",
                    "rude", "attentive", "slow", "fast", "polite"],
        "atmosphere": ["atmosphere", "ambiance", "decor", "interior",
                       "noisy", "quiet", "romantic", "cozy", "busy"],
        "price": ["price", "expensive", "cheap", "worth", "value",
                  "overpriced", "affordable", "cost"],
        "location": ["location", "parking", "street", "area",
                     "accessible", "central", "remote"],
    }
    # One rating bucket per aspect; a review may land in several buckets.
    buckets: dict[str, list] = {name: [] for name in aspect_keywords}
    rows = conn.execute("""
        SELECT text, rating FROM reviews
        WHERE business_url = ? AND rating IS NOT NULL
    """, (business_url,))
    for text, rating in rows:
        lowered = (text or "").lower()
        for name, words in aspect_keywords.items():
            if any(word in lowered for word in words):
                buckets[name].append(rating)
    summary = {}
    for name, ratings in buckets.items():
        if not ratings:
            continue
        summary[name] = {
            "mentions": len(ratings),
            "avg_rating": sum(ratings) / len(ratings),
            "positive": sum(1 for r in ratings if r >= 4),
            "negative": sum(1 for r in ratings if r <= 2),
        }
    return summary
def find_trending_topics(conn: sqlite3.Connection,
                         recent_days: int = 30) -> list[tuple]:
    """Find topics more common in recent reviews vs older ones.

    Splits reviews by scraped_at around a cutoff, counts word frequencies
    in each half, and returns up to 20 (word, recent_count, growth)
    tuples for words that grew 50%+ — sorted by growth, descending.

    Args:
        conn: Open SQLite connection with the reviews table.
        recent_days: Size of the "recent" window in days.
    """
    import re
    from collections import Counter
    from datetime import datetime, timedelta, timezone

    # scraped_at is written by SQLite's datetime('now') in UTC as
    # "YYYY-MM-DD HH:MM:SS". The original compared against
    # datetime.utcnow().isoformat(), whose "T" separator breaks same-day
    # string comparisons (and utcnow() is deprecated); format the cutoff
    # identically so the lexicographic comparison is apples-to-apples.
    cutoff = (datetime.now(timezone.utc)
              - timedelta(days=recent_days)).strftime("%Y-%m-%d %H:%M:%S")
    word_re = re.compile(r"\b[a-z]{4,}\b")  # compiled once, used per row
    recent_words: Counter = Counter()
    older_words: Counter = Counter()
    for text, scraped_at in conn.execute("""
        SELECT text, scraped_at FROM reviews
        WHERE text IS NOT NULL
    """):
        words = word_re.findall((text or "").lower())
        if scraped_at and scraped_at > cutoff:
            recent_words.update(words)
        else:
            older_words.update(words)
    # Generic filler words to ignore ("very" was duplicated originally)
    stopwords = {
        "this", "that", "with", "have", "been", "were",
        "from", "they", "will", "very", "just", "also",
        "good", "great", "nice", "best", "more",
    }
    trending = []
    for word, count in recent_words.most_common(200):
        if word in stopwords or count < 5:
            continue
        older_count = older_words.get(word, 0)
        if older_count > 0:
            growth = (count - older_count) / older_count
            if growth > 0.5:  # 50%+ increase
                trending.append((word, count, growth))
    return sorted(trending, key=lambda x: -x[2])[:20]
Automating Review Collection with a Scheduler
For ongoing review monitoring, schedule the scraper to run daily:
import schedule
import time as time_module
def daily_review_job(urls: list[str],
                     db_path: str = "tripadvisor_data.db") -> None:
    """Run the review collection pipeline as a scheduled job.

    Wraps full_pipeline with a US-geo proxy and a top-level error boundary
    so a failed run logs the error instead of killing the scheduler loop.

    Args:
        urls: Listing URLs to refresh.
        db_path: SQLite file to append collected data to.
    """
    print(f"Starting daily review collection for {len(urls)} businesses...")
    proxy = get_proxy_config(country="us")
    try:
        asyncio.run(full_pipeline(
            urls=urls,
            db_path=db_path,
            proxy_config=proxy,
            max_review_pages=3,  # Only latest reviews
        ))
    except Exception as e:
        print(f"Collection error: {e}")
    else:
        print("Daily collection complete.")
# Define your monitored business URLs (placeholders — replace with real
# listing URLs before running).
MONITORED = [
    "https://www.tripadvisor.com/Restaurant_Review-...",
    "https://www.tripadvisor.com/Hotel_Review-...",
]
# Schedule to run at 3 AM daily. schedule only registers the job here;
# nothing executes until run_pending() is called in the loop below.
schedule.every().day.at("03:00").do(daily_review_job, urls=MONITORED)
if __name__ == "__main__":
    print("Review monitor started. Press Ctrl+C to stop.")
    # Poll once a minute; schedule fires the job when its time arrives.
    while True:
        schedule.run_pending()
        time_module.sleep(60)
This creates a lightweight monitoring system that collects new reviews nightly without manual intervention. Combined with email alerts for negative reviews (rating ≤ 2), it's a functional reputation monitoring product that hospitality businesses will pay for.