Scraping Glassdoor Reviews and Salary Data in 2026 with Playwright
Scraping Glassdoor Reviews and Salary Data in 2026
Glassdoor has data that doesn't exist anywhere else: real salary numbers from employees, candid company reviews, actual interview questions. If you're doing competitive intelligence, recruiting analysis, or career research at scale, there's no substitute.
The problem: Glassdoor gates almost everything behind a login wall. You see 1-2 reviews, then a modal blocks the page demanding you sign up and leave your own review. Their anti-bot detection is also above average — DataDome protects most of their endpoints.
Here's how to get the data reliably.
The Login Wall Problem
Glassdoor's business model depends on user-generated content. They enforce a "give to get" policy: you must contribute reviews to see reviews. This creates a login wall after the first couple of results on any page.
You have two options:

1. Use a logged-in session — create a free account, authenticate with Playwright, and scrape with full access.
2. Intercept the GraphQL API — Glassdoor's frontend makes GraphQL calls that sometimes return data even without full authentication.
Option 1 is more reliable. Option 2 is more fragile but doesn't require account management.
Setting Up Authenticated Scraping
import asyncio
import json
from playwright.async_api import async_playwright
from pathlib import Path
# On-disk Playwright storage_state file (cookies + localStorage) written by
# authenticate_glassdoor() and reused by create_glassdoor_context().
STORAGE_PATH = "glassdoor_auth.json"
async def authenticate_glassdoor(email: str, password: str):
    """Log in to Glassdoor once and persist the session to STORAGE_PATH.

    Run interactively (headed) a single time; later scraping runs reuse the
    saved storage state instead of re-authenticating.

    Args:
        email: Glassdoor account email.
        password: Glassdoor account password.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=False,  # use headed mode for initial auth
            args=["--disable-blink-features=AutomationControlled"]
        )
        context = await browser.new_context(
            viewport={"width": 1920, "height": 1080},
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
        )
        page = await context.new_page()
        await page.goto("https://www.glassdoor.com/profile/login_input.htm")
        # Fill login form
        await page.fill('[name="username"]', email)
        await page.fill('[name="password"]', password)
        await page.click('[name="submit"]')
        # Wait for redirect after successful login
        await page.wait_for_url("**/member/**", timeout=30000)
        # Save session state (cookies + localStorage)
        await context.storage_state(path=STORAGE_PATH)
        print(f"Session saved to {STORAGE_PATH}")
        await browser.close()


# Run once to create the session file
# asyncio.run(authenticate_glassdoor("[email protected]", "your_password"))
Now use that saved session for scraping:
async def create_glassdoor_context(playwright, proxy_url: str = None):
    """Create a (possibly authenticated) headless browser context for Glassdoor.

    Reuses the storage state saved by authenticate_glassdoor() when the file
    exists; otherwise the context starts unauthenticated.

    Args:
        playwright: Active async_playwright() handle.
        proxy_url: Optional proxy server URL applied to the context.

    Returns:
        (browser, context) — the caller is responsible for closing the browser.
    """
    launch_args = [
        "--disable-blink-features=AutomationControlled",
        "--disable-dev-shm-usage",
    ]
    browser = await playwright.chromium.launch(
        headless=True,
        args=launch_args,
    )
    context_opts = {
        "viewport": {"width": 1920, "height": 1080},
        "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
        # storage_state=None is accepted by Playwright and means "fresh session"
        "storage_state": STORAGE_PATH if Path(STORAGE_PATH).exists() else None,
    }
    if proxy_url:
        context_opts["proxy"] = {"server": proxy_url}
    context = await browser.new_context(**context_opts)
    # Basic stealth: mask the webdriver flag and expose a chrome runtime stub
    await context.add_init_script("""
        Object.defineProperty(navigator, 'webdriver', { get: () => false });
        window.chrome = { runtime: {} };
    """)
    return browser, context
Scraping Company Reviews
async def scrape_company_reviews(
    company_slug: str,
    max_pages: int = 5,
    proxy_url: str = None,
) -> list[dict]:
    """
    Scrape reviews for a company.
    company_slug example: 'Google-Reviews-E9079' (from the Glassdoor URL)

    Iterates the paginated review listing until max_pages or an empty page,
    extracting one dict per review (title, rating, pros, cons, date, role,
    ceo_approval, recommend). Field values are best-effort — any selector
    that fails to match yields None for that field.

    Fix vs. the original: the injected JS declared `const items` and then
    reassigned it (`items = altItems`), which throws a TypeError inside
    page.evaluate whenever the primary selector matched nothing; the block
    was also dead code (the return statement re-queried the DOM). Removed.
    The per-iteration `import random` is hoisted out of the loop.
    """
    import random  # stdlib; used for jittered inter-page delays

    reviews = []
    async with async_playwright() as p:
        browser, context = await create_glassdoor_context(p, proxy_url)
        page = await context.new_page()
        for page_num in range(1, max_pages + 1):
            url = f"https://www.glassdoor.com/Reviews/{company_slug}_P{page_num}.htm?sort.sortType=RD"
            await page.goto(url, wait_until="domcontentloaded")
            await page.wait_for_timeout(2000)
            # Extract reviews from the page
            page_reviews = await page.evaluate("""
                () => {
                    // Match both id-based and data-test-based review containers.
                    return Array.from(document.querySelectorAll(
                        '[id^="empReview_"], [data-test="review-details"]'
                    )).map(el => {
                        const titleEl = el.querySelector('a.reviewLink, [data-test="review-title"]');
                        const ratingEl = el.querySelector('[class*="ratingNumber"]');
                        const prosEl = el.querySelector('[data-test="pros"], .pros');
                        const consEl = el.querySelector('[data-test="cons"], .cons');
                        const dateEl = el.querySelector('.common__EiReviewDetailsStyle__newUiRecDate, time');
                        const roleEl = el.querySelector('.common__EiReviewDetailsStyle__employee, [class*="authorInfo"]');
                        const ceoEl = el.querySelector('[data-test="ceo-approval"]');
                        const recommendEl = el.querySelector('[data-test="recommend"]');
                        return {
                            title: titleEl ? titleEl.innerText.trim() : null,
                            rating: ratingEl ? parseFloat(ratingEl.innerText) : null,
                            pros: prosEl ? prosEl.innerText.trim() : null,
                            cons: consEl ? consEl.innerText.trim() : null,
                            date: dateEl ? dateEl.innerText.trim() : null,
                            role: roleEl ? roleEl.innerText.trim() : null,
                            ceo_approval: ceoEl ? ceoEl.innerText.trim() : null,
                            recommend: recommendEl ? recommendEl.innerText.includes('Recommend') : null,
                        };
                    }).filter(r => r.title || r.pros);
                }
            """)
            reviews.extend(page_reviews)
            print(f" Page {page_num}: {len(page_reviews)} reviews")
            if not page_reviews:
                break
            # Random delay between pages — constant intervals are a bot signal
            await page.wait_for_timeout(random.randint(3000, 6000))
        await browser.close()
    return reviews
# Example usage: print a star bar + title and truncated pros/cons for the
# first five scraped reviews. Missing ratings render zero stars.
reviews = asyncio.run(scrape_company_reviews("Google-Reviews-E9079", max_pages=3))
for r in reviews[:5]:
    print(f"{'★' * int(r['rating'] or 0)} {r['title']}")
    print(f" Pros: {(r['pros'] or '')[:80]}...")
    print(f" Cons: {(r['cons'] or '')[:80]}...")
    print()
Scraping Salary Data
Glassdoor's salary pages show ranges for specific job titles at specific companies. This data is partially visible without login:
async def scrape_salary_data(
    company_slug: str,
    proxy_url: str = None,
) -> list[dict]:
    """
    Scrape salary data for a company.
    company_slug example: 'Google-Salaries-E9079'

    Loads the single salary listing page and extracts one dict per row:
    job_title, median_pay, pay_range, sample_count (all raw display strings,
    or None when a selector does not match).
    """
    async with async_playwright() as p:
        browser, context = await create_glassdoor_context(p, proxy_url)
        page = await context.new_page()
        url = f"https://www.glassdoor.com/Salary/{company_slug}.htm"
        await page.goto(url, wait_until="domcontentloaded")
        await page.wait_for_timeout(3000)
        # Multiple selectors per field to survive Glassdoor's frequent
        # frontend class-name churn; rows without a job title are dropped.
        salaries = await page.evaluate("""
            () => {
                const rows = document.querySelectorAll(
                    '[data-test="salary-row"], .salaryRow, tr[id^="salary"]'
                );
                return Array.from(rows).map(row => {
                    const titleEl = row.querySelector('a, [data-test="job-title"]');
                    const payEl = row.querySelector('[data-test="comp-target"], .css-1bluz6i');
                    const rangeEl = row.querySelector('[data-test="salary-range"]');
                    const countEl = row.querySelector('[data-test="salary-count"]');
                    return {
                        job_title: titleEl ? titleEl.innerText.trim() : null,
                        median_pay: payEl ? payEl.innerText.trim() : null,
                        pay_range: rangeEl ? rangeEl.innerText.trim() : null,
                        sample_count: countEl ? countEl.innerText.trim() : null,
                    };
                }).filter(s => s.job_title);
            }
        """)
        await browser.close()
    return salaries
# Example usage: dump the first ten salary rows.
salaries = asyncio.run(scrape_salary_data("Google-Salaries-E9079"))
for s in salaries[:10]:
    # NOTE(review): the ":>12s" format spec assumes median_pay is a string —
    # a row with a missing value (None) would raise TypeError here; confirm
    # before using this snippet in production.
    print(f"{s['job_title']:40s} {s['median_pay']:>12s} ({s['sample_count']})")
Scraping Interview Questions
The interview section is one of the most gated parts of Glassdoor. You almost always need a logged-in session:
async def scrape_interview_questions(
    company_slug: str,
    max_pages: int = 3,
    proxy_url: str = None,
) -> list[dict]:
    """
    Scrape interview experiences.
    company_slug example: 'Google-Interview-Questions-E9079'

    Iterates paginated interview reports until max_pages or an empty page.
    Returns one dict per report: role, date, outcome, difficulty,
    description, questions (None where a selector does not match).

    Fix vs. the original: `import random` ran inside the page loop on every
    iteration; it is hoisted to the top of the function.
    """
    import random  # stdlib; jittered inter-page delays

    interviews = []
    async with async_playwright() as p:
        browser, context = await create_glassdoor_context(p, proxy_url)
        page = await context.new_page()
        for page_num in range(1, max_pages + 1):
            url = f"https://www.glassdoor.com/Interview/{company_slug}_IP{page_num}.htm"
            await page.goto(url, wait_until="domcontentloaded")
            await page.wait_for_timeout(3000)
            page_data = await page.evaluate("""
                () => {
                    return Array.from(document.querySelectorAll(
                        '[id^="InterviewReview_"], .interview-details'
                    )).map(el => {
                        const titleEl = el.querySelector('.interview-title, [class*="title"]');
                        const dateEl = el.querySelector('time, .date');
                        const outcomeEl = el.querySelector('[class*="outcome"]');
                        const difficultyEl = el.querySelector('[class*="difficulty"]');
                        const descEl = el.querySelector('.interviewDescription, [class*="description"]');
                        const questionsEl = el.querySelector('.interviewQuestions, [class*="questions"]');
                        return {
                            role: titleEl ? titleEl.innerText.trim() : null,
                            date: dateEl ? dateEl.innerText.trim() : null,
                            outcome: outcomeEl ? outcomeEl.innerText.trim() : null,
                            difficulty: difficultyEl ? difficultyEl.innerText.trim() : null,
                            description: descEl ? descEl.innerText.trim() : null,
                            questions: questionsEl ? questionsEl.innerText.trim() : null,
                        };
                    }).filter(i => i.role || i.description);
                }
            """)
            interviews.extend(page_data)
            if not page_data:
                break
            # Random delay — constant intervals are a bot signal
            await page.wait_for_timeout(random.randint(4000, 7000))
        await browser.close()
    return interviews
The GraphQL API Approach
Glassdoor's React frontend talks to a GraphQL API. You can intercept these calls and replay them directly — faster than rendering full pages:
async def intercept_graphql_data(company_id: str, proxy_url: str = None) -> dict:
    """Capture Glassdoor's GraphQL responses for a company page.

    Attaches a response listener, loads the company Overview page, and
    collects review/salary/rating payloads out of the batched GraphQL
    responses the frontend makes.

    Args:
        company_id: Numeric Glassdoor employer ID (e.g. "9079").
        proxy_url: Optional proxy server URL.

    Returns:
        {"reviews": [...], "salaries": [...], "ratings": dict | None}

    Fix vs. the original: the bare `except:` also swallowed asyncio
    cancellation (CancelledError); narrowed to `except Exception`.
    """
    captured = {"reviews": [], "salaries": [], "ratings": None}
    async with async_playwright() as p:
        browser, context = await create_glassdoor_context(p, proxy_url)
        page = await context.new_page()

        async def on_response(response):
            if "graphql" not in response.url:
                return
            try:
                data = await response.json()
                # Glassdoor batches multiple queries in one request
                if isinstance(data, list):
                    for item in data:
                        d = item.get("data", {})
                        if "employerReviews" in d:
                            captured["reviews"].extend(
                                d["employerReviews"].get("reviews", [])
                            )
                        if "salaries" in d:
                            captured["salaries"].extend(
                                d["salaries"].get("results", [])
                            )
                        if "employer" in d and d["employer"]:
                            captured["ratings"] = d["employer"].get("ratings")
            except Exception:
                # Non-JSON body or unexpected payload shape — skip this
                # response. Deliberately best-effort; do not crash the page.
                pass

        page.on("response", on_response)
        url = f"https://www.glassdoor.com/Overview/Working-at-company-EI_IE{company_id}.htm"
        await page.goto(url, wait_until="networkidle")
        await page.wait_for_timeout(5000)
        await browser.close()
    return captured
Anti-Bot Detection: What Glassdoor Uses
Glassdoor runs DataDome on most of their pages. DataDome is one of the harder anti-bot systems to bypass — it scores requests based on a combination of signals:
- TLS fingerprint (JA3/JA4 hash) — Python's requests library is instantly identified
- Browser fingerprint — canvas hash, WebGL, plugins, timezone
- Behavioral signals — mouse movement patterns, scroll behavior
- Request patterns — timing, sequence, headers
What works against DataDome on Glassdoor:
- Real Chromium via Playwright — not requests, not even curl. You need a real browser.
- Stealth patches — hide navigator.webdriver, add realistic plugins
- IP rotation — DataDome tracks bot scores per IP. Once flagged, an IP stays flagged for hours. Rotating residential proxies are essential at any volume. ThorData works well here — their residential proxy pool rotates IPs per request, so each page load comes from a fresh IP that DataDome hasn't scored yet.
- Realistic timing — 3-7 second random delays between pages, never constant intervals
- Session persistence — save and reuse cookies. DataDome assigns a trust score to sessions with history.
# Proxy rotation with per-request IP change
async def scrape_with_rotation(urls: list[str], proxy_base: str) -> list[dict]:
    """Scrape multiple pages with IP rotation.

    Opens a fresh browser context per URL — with a rotating proxy endpoint,
    each context gets a new exit IP — while reusing the saved Glassdoor
    session state when present.

    Args:
        urls: Pages to fetch.
        proxy_base: Proxy server URL (rotating endpoint).

    Returns:
        One dict per URL: {"url", "html", "status"}; status is "ok" or the
        exception text on failure (html is None in that case).

    Fix vs. the original: `import random` ran inside the loop on every
    iteration; hoisted to the top of the function.
    """
    import random  # stdlib; jittered inter-request delays

    results = []
    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=True,
            args=["--disable-blink-features=AutomationControlled"],
        )
        for url in urls:
            # New context per request = new proxy IP
            context = await browser.new_context(
                proxy={"server": proxy_base},
                storage_state=STORAGE_PATH if Path(STORAGE_PATH).exists() else None,
                viewport={"width": 1920, "height": 1080},
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
            )
            page = await context.new_page()
            try:
                await page.goto(url, wait_until="domcontentloaded", timeout=20000)
                content = await page.content()
                results.append({"url": url, "html": content, "status": "ok"})
            except Exception as e:
                results.append({"url": url, "html": None, "status": str(e)})
            finally:
                await context.close()
            # Random delay — constant intervals are a bot signal
            await asyncio.sleep(random.uniform(3, 7))
        await browser.close()
    return results
Storing and Analyzing the Data
import sqlite3
from datetime import datetime
def store_reviews(company: str, reviews: list[dict], db_path: str = "glassdoor.db"):
    """Store scraped reviews in SQLite for analysis.

    Creates the `reviews` table on first use, then inserts the whole batch
    in one executemany() call.

    Args:
        company: Company name used as the grouping key.
        reviews: Review dicts as produced by scrape_company_reviews().
        db_path: SQLite database file path.

    Fixes vs. the original: the connection leaked if an insert raised
    (now closed in a finally), and per-row execute() is replaced with a
    single batched executemany().
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute("""
            CREATE TABLE IF NOT EXISTS reviews (
                company TEXT, title TEXT, rating REAL,
                pros TEXT, cons TEXT, role TEXT,
                review_date TEXT, scraped_at TEXT
            )
        """)
        now = datetime.now().isoformat()
        conn.executemany(
            "INSERT INTO reviews VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
            [
                (company, r.get("title"), r.get("rating"), r.get("pros"),
                 r.get("cons"), r.get("role"), r.get("date"), now)
                for r in reviews
            ],
        )
        conn.commit()
    finally:
        conn.close()
def analyze_sentiment(db_path: str = "glassdoor.db") -> dict:
    """Basic per-company aggregates over the stored reviews.

    Returns:
        {company: {"total_reviews", "avg_rating", "pct_positive"}}.
        avg_rating is None when no review for the company has a rating.

    Fix vs. the original: AVG(rating) is NULL when every rating is NULL,
    and round(None, 2) raised TypeError — now guarded. The connection is
    also closed even if iteration raises.
    """
    conn = sqlite3.connect(db_path)
    stats = {}
    try:
        for row in conn.execute("""
            SELECT company, COUNT(*) as cnt, AVG(rating) as avg_rating,
            SUM(CASE WHEN rating >= 4 THEN 1 ELSE 0 END) * 100.0 / COUNT(*) as pct_positive
            FROM reviews GROUP BY company
        """):
            stats[row[0]] = {
                "total_reviews": row[1],
                "avg_rating": round(row[2], 2) if row[2] is not None else None,
                "pct_positive": round(row[3], 1),
            }
    finally:
        conn.close()
    return stats
Practical Tips
Account management: Create accounts with different email providers. Glassdoor limits content access per account. If one gets flagged, you have backups.
Rate yourself appropriately: 3-6 second delays between page loads. Glassdoor's DataDome is particularly sensitive to rapid-fire requests. At higher volumes, each request should come from a different IP.
Session files expire: Re-authenticate every few days. Glassdoor invalidates sessions after roughly 7 days of inactivity.
Selectors change: Glassdoor updates their frontend frequently. The CSS selectors in this guide will break eventually. The GraphQL interception approach is more resilient — the API contract changes less often than class names.
Don't scrape what you can get via API: Glassdoor has a partner API for employers. If you're a company scraping your own reviews, apply for API access instead. It's rate-limited but legitimate and stable.
Multi-Company Comparative Analysis
The real value of Glassdoor data is comparing companies in the same industry. Here is how to build a comparative intelligence dataset:
import sqlite3
import json
import statistics
from datetime import datetime
# Example company ID map (Glassdoor company IDs).
# The ID is the numeric suffix in Glassdoor URLs, e.g. the "9079" in
# /Reviews/Google-Reviews-E9079.htm.
TECH_COMPANIES = {
    "Google": "9079",
    "Amazon": "6036",
    "Meta": "40772",
    "Apple": "1138",
    "Microsoft": "1651",
    "Salesforce": "11159",
    "Netflix": "11891",
    "Uber": "575263",
    "Airbnb": "391850",
    "Stripe": "676401",
}
def init_comparative_db(db_path: str = "glassdoor.db") -> sqlite3.Connection:
    """Create (if needed) the comparative-analysis schema and return an open connection.

    Tables: companies, reviews, salaries, interviews, plus two lookup
    indexes. The caller owns the returned connection and must close it.
    """
    conn = sqlite3.connect(db_path)
    conn.executescript("""
        CREATE TABLE IF NOT EXISTS companies (
            company_id TEXT PRIMARY KEY,
            name TEXT,
            industry TEXT,
            scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        );
        CREATE TABLE IF NOT EXISTS reviews (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            company TEXT,
            company_id TEXT,
            title TEXT,
            rating REAL,
            pros TEXT,
            cons TEXT,
            role TEXT,
            review_date TEXT,
            recommend INTEGER,
            ceo_approval TEXT,
            scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        );
        CREATE TABLE IF NOT EXISTS salaries (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            company TEXT,
            company_id TEXT,
            job_title TEXT,
            median_pay TEXT,
            pay_range TEXT,
            sample_count TEXT,
            scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        );
        CREATE TABLE IF NOT EXISTS interviews (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            company TEXT,
            company_id TEXT,
            role TEXT,
            outcome TEXT,
            difficulty TEXT,
            description TEXT,
            questions TEXT,
            interview_date TEXT,
            scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        );
        CREATE INDEX IF NOT EXISTS idx_reviews_company
        ON reviews(company, rating);
        CREATE INDEX IF NOT EXISTS idx_salaries_title
        ON salaries(job_title, company);
    """)
    conn.commit()
    return conn
def save_reviews_batch(
    conn: sqlite3.Connection,
    company: str,
    company_id: str,
    reviews: list,
):
    """Save a batch of reviews to the database.

    Improvement vs. the original: a single executemany() with one prepared
    statement replaces per-row execute() calls; commits once at the end.

    Args:
        conn: Open connection whose schema matches init_comparative_db().
        company: Display name stored in the `company` column.
        company_id: Glassdoor numeric ID.
        reviews: Review dicts as produced by scrape_company_reviews().
    """
    now = datetime.now().isoformat()
    rows = [
        (
            company, company_id,
            r.get("title"), r.get("rating"),
            r.get("pros"), r.get("cons"),
            r.get("role"), r.get("date"),
            1 if r.get("recommend") else 0,
            r.get("ceo_approval"),
            now,
        )
        for r in reviews
    ]
    conn.executemany(
        """INSERT INTO reviews
        (company, company_id, title, rating, pros, cons, role,
        review_date, recommend, ceo_approval, scraped_at)
        VALUES (?,?,?,?,?,?,?,?,?,?,?)""",
        rows,
    )
    conn.commit()
Sentiment Analysis on Review Text
Extract structured insights from the free-text pros/cons:
import re
# Keyword lexicon for crude dimension-level sentiment scoring.
# Each dimension maps to lowercase substrings matched against the
# concatenated pros+cons text (see score_review_dimensions).
SENTIMENT_KEYWORDS = {
    "work_life_balance": {
        "positive": ["work life balance", "flexible", "remote", "wfh", "unlimited pto"],
        "negative": ["long hours", "no balance", "burnout", "overworked", "60 hours", "always on call"],
    },
    "management": {
        "positive": ["great manager", "supportive leadership", "good management", "transparent"],
        "negative": ["micromanage", "bad manager", "toxic", "poor leadership", "no direction"],
    },
    "compensation": {
        "positive": ["great pay", "competitive salary", "generous", "good benefits", "equity"],
        "negative": ["underpaid", "low salary", "poor compensation", "no raise", "below market"],
    },
    "culture": {
        "positive": ["great culture", "inclusive", "collaborative", "smart coworkers", "fun team"],
        "negative": ["politics", "toxic culture", "layoffs", "high turnover", "cutthroat"],
    },
    "growth": {
        "positive": ["career growth", "learning", "promotions", "development", "mentorship"],
        "negative": ["no growth", "stagnant", "no promotion", "limited learning", "dead end"],
    },
}
def score_review_dimensions(review: dict) -> dict:
    """Score a review across key workplace dimensions.

    Counts SENTIMENT_KEYWORDS substring hits in the combined pros/cons text;
    a dimension appears in the result only when at least one keyword matched,
    with score = positive hits - negative hits.
    """
    combined = ((review.get("pros") or "") + " " + (review.get("cons") or "")).lower()
    result = {}
    for dimension, lexicon in SENTIMENT_KEYWORDS.items():
        pos = sum(phrase in combined for phrase in lexicon["positive"])
        neg = sum(phrase in combined for phrase in lexicon["negative"])
        if pos or neg:
            result[dimension] = pos - neg
    return result
def analyze_company_culture_dimensions(
    company: str,
    db_path: str = "glassdoor.db",
) -> dict:
    """Aggregate per-dimension sentiment scores across a company's reviews.

    Returns {"error": "no_reviews", ...} when the company has no stored
    reviews; otherwise totals, mean rating, and per-dimension sentiment
    summaries (label, mean score, mention count).
    """
    conn = sqlite3.connect(db_path)
    rows = conn.execute(
        "SELECT title, rating, pros, cons, role FROM reviews WHERE company = ?",
        (company,)
    ).fetchall()
    conn.close()
    if not rows:
        return {"error": "no_reviews", "company": company}

    per_dimension = {name: [] for name in SENTIMENT_KEYWORDS}
    rating_values = []
    for title, rating, pros, cons, _role in rows:
        if rating:
            rating_values.append(rating)
        hits = score_review_dimensions(
            {"title": title, "rating": rating, "pros": pros, "cons": cons}
        )
        for name, value in hits.items():
            per_dimension[name].append(value)

    summary = {
        "company": company,
        "total_reviews": len(rows),
        "avg_rating": round(statistics.mean(rating_values), 2) if rating_values else None,
        "dimensions": {},
    }
    for name, values in per_dimension.items():
        if not values:
            continue
        mean_score = statistics.mean(values)
        if mean_score > 0:
            label = "positive"
        elif mean_score < 0:
            label = "negative"
        else:
            label = "neutral"
        summary["dimensions"][name] = {
            "sentiment": label,
            "score": round(mean_score, 2),
            "mentions": len(values),
        }
    return summary
Salary Intelligence Reports
Extract and normalize salary data for market research:
def extract_salary_value(pay_text: str) -> tuple:
    """Extract numeric salary figures from Glassdoor's pay text.

    Handles display strings such as "$X/yr", "$X,XXX", and "$XX - $YY".
    Returns (low, high); high is None for a single figure, and (None, None)
    when no dollar amount is present.
    """
    if not pay_text:
        return None, None
    compact = pay_text.replace(" ", "")
    values = []
    for token in re.findall(r'\$([\d,]+)', compact):
        try:
            values.append(int(token.replace(",", "")))
        except ValueError:
            continue
    if not values:
        return None, None
    if len(values) == 1:
        return values[0], None
    return values[0], values[1]
def analyze_salary_by_role(
    role_keywords: list,
    db_path: str = "glassdoor.db",
) -> dict:
    """Compare salaries for a role across companies.

    Matches job titles against any of role_keywords (SQL LIKE, bound
    parameters), parses the stored pay text, and returns per-company
    median/min/max stats sorted by median salary, descending.
    """
    like_clause = " OR ".join("job_title LIKE ?" for _ in role_keywords)
    conn = sqlite3.connect(db_path)
    rows = conn.execute(
        f"""SELECT company, job_title, median_pay, pay_range, sample_count
        FROM salaries
        WHERE {like_clause}
        ORDER BY company""",
        tuple(f'%{kw}%' for kw in role_keywords),
    ).fetchall()
    conn.close()

    grouped = {}
    for company, title, pay_text, _pay_range, count in rows:
        value, _ = extract_salary_value(pay_text)
        # $30k floor filters obviously bad parses (same rule as before)
        if not value or value <= 30000:
            continue
        grouped.setdefault(company, []).append({
            "title": title,
            "salary": value,
            "sample_count": count,
        })

    summary = {}
    for company, entries in grouped.items():
        pay_values = [e["salary"] for e in entries]
        if pay_values:
            summary[company] = {
                "median_salary": statistics.median(pay_values),
                "min_salary": min(pay_values),
                "max_salary": max(pay_values),
                "roles_found": len(entries),
                "titles": [e["title"] for e in entries[:3]],
            }
    return dict(
        sorted(summary.items(), key=lambda item: item[1]["median_salary"], reverse=True)
    )
# Compare SWE salaries across big tech
# (assumes the salaries table has been populated by the pipeline first)
swe_salaries = analyze_salary_by_role(
    ["software engineer", "software developer", "backend engineer"],
    db_path="glassdoor.db",
)
print("Software Engineer salaries by company:")
for company, data in swe_salaries.items():
    print(f" {company:<20} ${data['median_salary']:,.0f}/yr ({data['roles_found']} data points)")
Identifying Interview Difficulty Trends
Track how interview processes change over time:
def analyze_interview_patterns(
    company: str,
    db_path: str = "glassdoor.db",
) -> dict:
    """Analyze interview difficulty, common questions, and outcomes.

    Returns outcome/difficulty histograms (keys are lowercased raw strings;
    missing values count under ""), the 20 most frequent question keywords,
    and a positive-outcome rate.

    Fixes vs. the original: the trailing `if interviews else 0` was dead
    code (the early return above guarantees non-empty), and the stopword
    list is now a set built once instead of a list scanned per word.
    """
    conn = sqlite3.connect(db_path)
    interviews = conn.execute(
        """SELECT role, outcome, difficulty, description, questions
        FROM interviews WHERE company = ?""",
        (company,)
    ).fetchall()
    conn.close()
    if not interviews:
        return {"error": "no_data"}
    outcomes = {}
    difficulties = {}
    common_question_words = {}
    # Filler words excluded from question-theme extraction
    stopwords = {"that", "this", "with", "from", "they", "were", "what", "have", "been"}
    for row in interviews:
        outcome = (row[1] or "").lower()
        difficulty = (row[2] or "").lower()
        outcomes[outcome] = outcomes.get(outcome, 0) + 1
        difficulties[difficulty] = difficulties.get(difficulty, 0) + 1
        # Extract question keywords (4+ letter lowercase words)
        questions_text = row[4] or ""
        for word in re.findall(r'\b[a-z]{4,}\b', questions_text.lower()):
            if word not in stopwords:
                common_question_words[word] = common_question_words.get(word, 0) + 1
    top_words = sorted(common_question_words.items(), key=lambda x: x[1], reverse=True)[:20]
    return {
        "company": company,
        "total_interviews": len(interviews),
        "outcome_distribution": outcomes,
        "difficulty_distribution": difficulties,
        "common_question_themes": [w for w, _ in top_words],
        # NOTE(review): matches only the literal outcome text "accepted";
        # phrasing like "accepted offer" would not count — confirm against
        # real Glassdoor outcome strings.
        "positive_outcome_rate": round(
            outcomes.get("accepted", 0) / len(interviews) * 100, 1
        ),
    }
Automated Competitive Intelligence Pipeline
import asyncio
async def run_glassdoor_intelligence_pipeline(
    companies: dict,
    db_path: str = "glassdoor.db",
    proxy_url: str = None,
):
    """
    Full competitive intelligence pipeline.
    Collects reviews and salary data for multiple companies into db_path.

    Fixes vs. the original:
    - `random` was used for the jittered delays but never imported in this
      section (NameError at runtime); imported locally.
    - A browser context was launched per company via create_glassdoor_context
      and never used — scrape_company_reviews/scrape_salary_data each open
      their own Playwright session — so the wasted launch is removed.

    Args:
        companies: {display_name: glassdoor_numeric_id} map (e.g. TECH_COMPANIES).
        db_path: SQLite file initialized via init_comparative_db().
        proxy_url: Optional proxy passed through to the scrapers.
    """
    import random  # jittered inter-request delays

    conn = init_comparative_db(db_path)
    for company_name, company_id in companies.items():
        print(f"\nProcessing: {company_name} (ID: {company_id})")
        try:
            # Reviews
            review_slug = f"{company_name.replace(' ', '-')}-Reviews-E{company_id}"
            reviews = await scrape_company_reviews(
                review_slug, max_pages=3, proxy_url=proxy_url
            )
            save_reviews_batch(conn, company_name, company_id, reviews)
            print(f" Reviews: {len(reviews)}")
            await asyncio.sleep(random.uniform(15, 30))
            # Salaries
            salary_slug = f"{company_name.replace(' ', '-')}-Salaries-E{company_id}"
            salaries = await scrape_salary_data(salary_slug, proxy_url=proxy_url)
            now = datetime.now().isoformat()
            for s in salaries:
                conn.execute(
                    """INSERT INTO salaries
                    (company, company_id, job_title, median_pay, pay_range, sample_count, scraped_at)
                    VALUES (?,?,?,?,?,?,?)""",
                    (company_name, company_id, s.get("job_title"),
                     s.get("median_pay"), s.get("pay_range"),
                     s.get("sample_count"), now),
                )
            conn.commit()
            print(f" Salaries: {len(salaries)}")
            await asyncio.sleep(random.uniform(15, 30))
        except Exception as e:
            # Best-effort per company: log and continue with the next one
            print(f" Error: {e}")
        finally:
            # Long cool-down between companies
            await asyncio.sleep(random.uniform(30, 60))
    conn.close()
    print("\nPipeline complete")
# Generate comparative report
async def generate_comparative_report(
    companies: list,
    db_path: str = "glassdoor.db",
) -> dict:
    """Generate a comparative intelligence report across companies.

    Combines per-company culture-dimension analysis and interview-pattern
    analysis, then ranks companies by average rating (descending).
    """
    per_company = {}
    for name in companies:
        culture = analyze_company_culture_dimensions(name, db_path)
        per_company[name] = {
            "avg_rating": culture.get("avg_rating"),
            "total_reviews": culture.get("total_reviews"),
            "culture_dimensions": culture.get("dimensions", {}),
            "interview_patterns": analyze_interview_patterns(name, db_path),
        }
    # Rank only the companies that actually have a rating
    rated = [
        (name, info["avg_rating"])
        for name, info in per_company.items()
        if info.get("avg_rating")
    ]
    rated.sort(key=lambda pair: pair[1], reverse=True)
    return {
        "generated_at": datetime.now().isoformat(),
        "companies": per_company,
        "rankings": {
            "by_overall_rating": [{"company": n, "rating": r} for n, r in rated],
        },
    }
# Kick off the full pipeline for the mapped companies.
# NOTE: replace the placeholder proxy credentials before running.
asyncio.run(run_glassdoor_intelligence_pipeline(
    companies=TECH_COMPANIES,
    proxy_url="http://USER:[email protected]:9000",
))
Key Takeaways for Glassdoor Scraping in 2026
- Glassdoor requires a logged-in session for anything beyond the first 1-2 results; create a free account, log in with Playwright once, and save the session state to disk
- DataDome protects most Glassdoor pages -- it checks TLS fingerprint, browser properties, and behavioral patterns; Playwright with stealth patches is required
- ThorData's residential proxy network handles DataDome's IP reputation scoring; each new browser context rotates to a fresh IP
- Per-IP rate limiting is strict: 3-7 second random delays, and create a new browser context per company to rotate IPs
- Session files expire after ~7 days -- build in re-authentication logic for long-running pipelines
- The GraphQL API interception approach is more resilient than CSS selector parsing -- class names change frequently, API response structure does not
- The most valuable use cases: salary benchmarking across companies, culture dimension sentiment analysis, and interview difficulty trend tracking