Scraping Glassdoor Reviews and Salary Data in 2026 with Playwright
Scraping Glassdoor Reviews and Salary Data in 2026
Glassdoor has data that doesn't exist anywhere else: real salary numbers from employees, candid company reviews, actual interview questions. If you're doing competitive intelligence, recruiting analysis, or career research at scale, there's no substitute.
The problem: Glassdoor gates almost everything behind a login wall. You see 1-2 reviews, then a modal blocks the page demanding you sign up and leave your own review. Their anti-bot detection is also above average — DataDome protects most of their endpoints.
Here's how to get the data reliably.
The Login Wall Problem
Glassdoor's business model depends on user-generated content. They enforce a "give to get" policy: you must contribute reviews to see reviews. This creates a login wall after the first couple of results on any page.
You have two options:

1. Use a logged-in session — create a free account, authenticate with Playwright, and scrape with full access.
2. Intercept the GraphQL API — Glassdoor's frontend makes GraphQL calls that sometimes return data even without full authentication.
Option 1 is more reliable. Option 2 is more fragile but doesn't require account management.
Setting Up Authenticated Scraping
import asyncio
import json
from playwright.async_api import async_playwright
from pathlib import Path
# On-disk Playwright storage_state file (cookies + localStorage) written by
# authenticate_glassdoor() and reused by create_glassdoor_context().
STORAGE_PATH = "glassdoor_auth.json"
async def authenticate_glassdoor(email: str, password: str):
    """Log in to Glassdoor once and persist the session to STORAGE_PATH.

    Run interactively (headed) a single time; later scraping runs reuse the
    saved storage state instead of re-authenticating.

    Args:
        email: Glassdoor account email.
        password: Glassdoor account password.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=False,  # use headed mode for initial auth
            args=["--disable-blink-features=AutomationControlled"]
        )
        context = await browser.new_context(
            viewport={"width": 1920, "height": 1080},
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
        )
        page = await context.new_page()
        await page.goto("https://www.glassdoor.com/profile/login_input.htm")
        # Fill login form
        await page.fill('[name="username"]', email)
        await page.fill('[name="password"]', password)
        await page.click('[name="submit"]')
        # Wait for redirect after successful login
        await page.wait_for_url("**/member/**", timeout=30000)
        # Save session state (cookies + localStorage)
        await context.storage_state(path=STORAGE_PATH)
        print(f"Session saved to {STORAGE_PATH}")
        await browser.close()


# Run once to create the session file
# asyncio.run(authenticate_glassdoor("[email protected]", "your_password"))
Now use that saved session for scraping:
async def create_glassdoor_context(playwright, proxy_url: str = None):
    """Create a (possibly authenticated) headless browser context for Glassdoor.

    Reuses the storage state saved by authenticate_glassdoor() when the file
    exists; otherwise the context starts unauthenticated.

    Args:
        playwright: Active async_playwright() handle.
        proxy_url: Optional proxy server URL applied to the context.

    Returns:
        (browser, context) — the caller is responsible for closing the browser.
    """
    launch_args = [
        "--disable-blink-features=AutomationControlled",
        "--disable-dev-shm-usage",
    ]
    browser = await playwright.chromium.launch(
        headless=True,
        args=launch_args,
    )
    context_opts = {
        "viewport": {"width": 1920, "height": 1080},
        "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
        # storage_state=None is accepted by Playwright and means "fresh session"
        "storage_state": STORAGE_PATH if Path(STORAGE_PATH).exists() else None,
    }
    if proxy_url:
        context_opts["proxy"] = {"server": proxy_url}
    context = await browser.new_context(**context_opts)
    # Basic stealth: mask the webdriver flag and expose a chrome runtime stub
    await context.add_init_script("""
        Object.defineProperty(navigator, 'webdriver', { get: () => false });
        window.chrome = { runtime: {} };
    """)
    return browser, context
Scraping Company Reviews
async def scrape_company_reviews(
    company_slug: str,
    max_pages: int = 5,
    proxy_url: str = None,
) -> list[dict]:
    """
    Scrape reviews for a company.
    company_slug example: 'Google-Reviews-E9079' (from the Glassdoor URL)

    Iterates the paginated review listing until max_pages or an empty page,
    extracting one dict per review (title, rating, pros, cons, date, role,
    ceo_approval, recommend). Field values are best-effort — any selector
    that fails to match yields None for that field.

    Fix vs. the original: the injected JS declared `const items` and then
    reassigned it (`items = altItems`), which throws a TypeError inside
    page.evaluate whenever the primary selector matched nothing; the block
    was also dead code (the return statement re-queried the DOM). Removed.
    The per-iteration `import random` is hoisted out of the loop.
    """
    import random  # stdlib; used for jittered inter-page delays

    reviews = []
    async with async_playwright() as p:
        browser, context = await create_glassdoor_context(p, proxy_url)
        page = await context.new_page()
        for page_num in range(1, max_pages + 1):
            url = f"https://www.glassdoor.com/Reviews/{company_slug}_P{page_num}.htm?sort.sortType=RD"
            await page.goto(url, wait_until="domcontentloaded")
            await page.wait_for_timeout(2000)
            # Extract reviews from the page
            page_reviews = await page.evaluate("""
                () => {
                    // Match both id-based and data-test-based review containers.
                    return Array.from(document.querySelectorAll(
                        '[id^="empReview_"], [data-test="review-details"]'
                    )).map(el => {
                        const titleEl = el.querySelector('a.reviewLink, [data-test="review-title"]');
                        const ratingEl = el.querySelector('[class*="ratingNumber"]');
                        const prosEl = el.querySelector('[data-test="pros"], .pros');
                        const consEl = el.querySelector('[data-test="cons"], .cons');
                        const dateEl = el.querySelector('.common__EiReviewDetailsStyle__newUiRecDate, time');
                        const roleEl = el.querySelector('.common__EiReviewDetailsStyle__employee, [class*="authorInfo"]');
                        const ceoEl = el.querySelector('[data-test="ceo-approval"]');
                        const recommendEl = el.querySelector('[data-test="recommend"]');
                        return {
                            title: titleEl ? titleEl.innerText.trim() : null,
                            rating: ratingEl ? parseFloat(ratingEl.innerText) : null,
                            pros: prosEl ? prosEl.innerText.trim() : null,
                            cons: consEl ? consEl.innerText.trim() : null,
                            date: dateEl ? dateEl.innerText.trim() : null,
                            role: roleEl ? roleEl.innerText.trim() : null,
                            ceo_approval: ceoEl ? ceoEl.innerText.trim() : null,
                            recommend: recommendEl ? recommendEl.innerText.includes('Recommend') : null,
                        };
                    }).filter(r => r.title || r.pros);
                }
            """)
            reviews.extend(page_reviews)
            print(f" Page {page_num}: {len(page_reviews)} reviews")
            if not page_reviews:
                break
            # Random delay between pages — constant intervals are a bot signal
            await page.wait_for_timeout(random.randint(3000, 6000))
        await browser.close()
    return reviews
# Example usage: print a star bar + title and truncated pros/cons for the
# first five scraped reviews. Missing ratings render zero stars.
reviews = asyncio.run(scrape_company_reviews("Google-Reviews-E9079", max_pages=3))
for r in reviews[:5]:
    print(f"{'★' * int(r['rating'] or 0)} {r['title']}")
    print(f" Pros: {(r['pros'] or '')[:80]}...")
    print(f" Cons: {(r['cons'] or '')[:80]}...")
    print()
Scraping Salary Data
Glassdoor's salary pages show ranges for specific job titles at specific companies. This data is partially visible without login:
async def scrape_salary_data(
    company_slug: str,
    proxy_url: str = None,
) -> list[dict]:
    """
    Scrape salary data for a company.
    company_slug example: 'Google-Salaries-E9079'

    Loads the single salary listing page and extracts one dict per row:
    job_title, median_pay, pay_range, sample_count (all raw display strings,
    or None when a selector does not match).
    """
    async with async_playwright() as p:
        browser, context = await create_glassdoor_context(p, proxy_url)
        page = await context.new_page()
        url = f"https://www.glassdoor.com/Salary/{company_slug}.htm"
        await page.goto(url, wait_until="domcontentloaded")
        await page.wait_for_timeout(3000)
        # Multiple selectors per field to survive Glassdoor's frequent
        # frontend class-name churn; rows without a job title are dropped.
        salaries = await page.evaluate("""
            () => {
                const rows = document.querySelectorAll(
                    '[data-test="salary-row"], .salaryRow, tr[id^="salary"]'
                );
                return Array.from(rows).map(row => {
                    const titleEl = row.querySelector('a, [data-test="job-title"]');
                    const payEl = row.querySelector('[data-test="comp-target"], .css-1bluz6i');
                    const rangeEl = row.querySelector('[data-test="salary-range"]');
                    const countEl = row.querySelector('[data-test="salary-count"]');
                    return {
                        job_title: titleEl ? titleEl.innerText.trim() : null,
                        median_pay: payEl ? payEl.innerText.trim() : null,
                        pay_range: rangeEl ? rangeEl.innerText.trim() : null,
                        sample_count: countEl ? countEl.innerText.trim() : null,
                    };
                }).filter(s => s.job_title);
            }
        """)
        await browser.close()
    return salaries
# Example usage: dump the first ten salary rows.
salaries = asyncio.run(scrape_salary_data("Google-Salaries-E9079"))
for s in salaries[:10]:
    # NOTE(review): the ":>12s" format spec assumes median_pay is a string —
    # a row with a missing value (None) would raise TypeError here; confirm
    # before using this snippet in production.
    print(f"{s['job_title']:40s} {s['median_pay']:>12s} ({s['sample_count']})")
Scraping Interview Questions
The interview section is one of the most gated parts of Glassdoor. You almost always need a logged-in session:
async def scrape_interview_questions(
    company_slug: str,
    max_pages: int = 3,
    proxy_url: str = None,
) -> list[dict]:
    """
    Scrape interview experiences.
    company_slug example: 'Google-Interview-Questions-E9079'

    Iterates paginated interview reports until max_pages or an empty page.
    Returns one dict per report: role, date, outcome, difficulty,
    description, questions (None where a selector does not match).

    Fix vs. the original: `import random` ran inside the page loop on every
    iteration; it is hoisted to the top of the function.
    """
    import random  # stdlib; jittered inter-page delays

    interviews = []
    async with async_playwright() as p:
        browser, context = await create_glassdoor_context(p, proxy_url)
        page = await context.new_page()
        for page_num in range(1, max_pages + 1):
            url = f"https://www.glassdoor.com/Interview/{company_slug}_IP{page_num}.htm"
            await page.goto(url, wait_until="domcontentloaded")
            await page.wait_for_timeout(3000)
            page_data = await page.evaluate("""
                () => {
                    return Array.from(document.querySelectorAll(
                        '[id^="InterviewReview_"], .interview-details'
                    )).map(el => {
                        const titleEl = el.querySelector('.interview-title, [class*="title"]');
                        const dateEl = el.querySelector('time, .date');
                        const outcomeEl = el.querySelector('[class*="outcome"]');
                        const difficultyEl = el.querySelector('[class*="difficulty"]');
                        const descEl = el.querySelector('.interviewDescription, [class*="description"]');
                        const questionsEl = el.querySelector('.interviewQuestions, [class*="questions"]');
                        return {
                            role: titleEl ? titleEl.innerText.trim() : null,
                            date: dateEl ? dateEl.innerText.trim() : null,
                            outcome: outcomeEl ? outcomeEl.innerText.trim() : null,
                            difficulty: difficultyEl ? difficultyEl.innerText.trim() : null,
                            description: descEl ? descEl.innerText.trim() : null,
                            questions: questionsEl ? questionsEl.innerText.trim() : null,
                        };
                    }).filter(i => i.role || i.description);
                }
            """)
            interviews.extend(page_data)
            if not page_data:
                break
            # Random delay — constant intervals are a bot signal
            await page.wait_for_timeout(random.randint(4000, 7000))
        await browser.close()
    return interviews
The GraphQL API Approach
Glassdoor's React frontend talks to a GraphQL API. You can intercept these calls and replay them directly — faster than rendering full pages:
async def intercept_graphql_data(company_id: str, proxy_url: str = None) -> dict:
    """Capture Glassdoor's GraphQL responses for a company page.

    Attaches a response listener, loads the company Overview page, and
    collects review/salary/rating payloads out of the batched GraphQL
    responses the frontend makes.

    Args:
        company_id: Numeric Glassdoor employer ID (e.g. "9079").
        proxy_url: Optional proxy server URL.

    Returns:
        {"reviews": [...], "salaries": [...], "ratings": dict | None}

    Fix vs. the original: the bare `except:` also swallowed asyncio
    cancellation (CancelledError); narrowed to `except Exception`.
    """
    captured = {"reviews": [], "salaries": [], "ratings": None}
    async with async_playwright() as p:
        browser, context = await create_glassdoor_context(p, proxy_url)
        page = await context.new_page()

        async def on_response(response):
            if "graphql" not in response.url:
                return
            try:
                data = await response.json()
                # Glassdoor batches multiple queries in one request
                if isinstance(data, list):
                    for item in data:
                        d = item.get("data", {})
                        if "employerReviews" in d:
                            captured["reviews"].extend(
                                d["employerReviews"].get("reviews", [])
                            )
                        if "salaries" in d:
                            captured["salaries"].extend(
                                d["salaries"].get("results", [])
                            )
                        if "employer" in d and d["employer"]:
                            captured["ratings"] = d["employer"].get("ratings")
            except Exception:
                # Non-JSON body or unexpected payload shape — skip this
                # response. Deliberately best-effort; do not crash the page.
                pass

        page.on("response", on_response)
        url = f"https://www.glassdoor.com/Overview/Working-at-company-EI_IE{company_id}.htm"
        await page.goto(url, wait_until="networkidle")
        await page.wait_for_timeout(5000)
        await browser.close()
    return captured
Anti-Bot Detection: What Glassdoor Uses
Glassdoor runs DataDome on most of their pages. DataDome is one of the harder anti-bot systems to bypass — it scores requests based on a combination of signals:
- TLS fingerprint (JA3/JA4 hash) — Python's requests library is instantly identified
- Browser fingerprint — canvas hash, WebGL, plugins, timezone
- Behavioral signals — mouse movement patterns, scroll behavior
- Request patterns — timing, sequence, headers
What works against DataDome on Glassdoor:
- Real Chromium via Playwright — not requests, not even curl. You need a real browser.
- Stealth patches — hide navigator.webdriver, add realistic plugins
- IP rotation — DataDome tracks bot scores per IP. Once flagged, an IP stays flagged for hours. Rotating residential proxies are essential at any volume. ThorData works well here — their residential proxy pool rotates IPs per request, so each page load comes from a fresh IP that DataDome hasn't scored yet.
- Realistic timing — 3-7 second random delays between pages, never constant intervals
- Session persistence — save and reuse cookies. DataDome assigns a trust score to sessions with history.
# Proxy rotation with per-request IP change
async def scrape_with_rotation(urls: list[str], proxy_base: str) -> list[dict]:
    """Scrape multiple pages with IP rotation.

    Opens a fresh browser context per URL — with a rotating proxy endpoint,
    each context gets a new exit IP — while reusing the saved Glassdoor
    session state when present.

    Args:
        urls: Pages to fetch.
        proxy_base: Proxy server URL (rotating endpoint).

    Returns:
        One dict per URL: {"url", "html", "status"}; status is "ok" or the
        exception text on failure (html is None in that case).

    Fix vs. the original: `import random` ran inside the loop on every
    iteration; hoisted to the top of the function.
    """
    import random  # stdlib; jittered inter-request delays

    results = []
    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=True,
            args=["--disable-blink-features=AutomationControlled"],
        )
        for url in urls:
            # New context per request = new proxy IP
            context = await browser.new_context(
                proxy={"server": proxy_base},
                storage_state=STORAGE_PATH if Path(STORAGE_PATH).exists() else None,
                viewport={"width": 1920, "height": 1080},
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
            )
            page = await context.new_page()
            try:
                await page.goto(url, wait_until="domcontentloaded", timeout=20000)
                content = await page.content()
                results.append({"url": url, "html": content, "status": "ok"})
            except Exception as e:
                results.append({"url": url, "html": None, "status": str(e)})
            finally:
                await context.close()
            # Random delay — constant intervals are a bot signal
            await asyncio.sleep(random.uniform(3, 7))
        await browser.close()
    return results
Storing and Analyzing the Data
import sqlite3
from datetime import datetime
def store_reviews(company: str, reviews: list[dict], db_path: str = "glassdoor.db"):
    """Store scraped reviews in SQLite for analysis.

    Creates the `reviews` table on first use, then inserts the whole batch
    in one executemany() call.

    Args:
        company: Company name used as the grouping key.
        reviews: Review dicts as produced by scrape_company_reviews().
        db_path: SQLite database file path.

    Fixes vs. the original: the connection leaked if an insert raised
    (now closed in a finally), and per-row execute() is replaced with a
    single batched executemany().
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute("""
            CREATE TABLE IF NOT EXISTS reviews (
                company TEXT, title TEXT, rating REAL,
                pros TEXT, cons TEXT, role TEXT,
                review_date TEXT, scraped_at TEXT
            )
        """)
        now = datetime.now().isoformat()
        conn.executemany(
            "INSERT INTO reviews VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
            [
                (company, r.get("title"), r.get("rating"), r.get("pros"),
                 r.get("cons"), r.get("role"), r.get("date"), now)
                for r in reviews
            ],
        )
        conn.commit()
    finally:
        conn.close()
def analyze_sentiment(db_path: str = "glassdoor.db") -> dict:
    """Basic per-company aggregates over the stored reviews.

    Returns:
        {company: {"total_reviews", "avg_rating", "pct_positive"}}.
        avg_rating is None when no review for the company has a rating.

    Fix vs. the original: AVG(rating) is NULL when every rating is NULL,
    and round(None, 2) raised TypeError — now guarded. The connection is
    also closed even if iteration raises.
    """
    conn = sqlite3.connect(db_path)
    stats = {}
    try:
        for row in conn.execute("""
            SELECT company, COUNT(*) as cnt, AVG(rating) as avg_rating,
            SUM(CASE WHEN rating >= 4 THEN 1 ELSE 0 END) * 100.0 / COUNT(*) as pct_positive
            FROM reviews GROUP BY company
        """):
            stats[row[0]] = {
                "total_reviews": row[1],
                "avg_rating": round(row[2], 2) if row[2] is not None else None,
                "pct_positive": round(row[3], 1),
            }
    finally:
        conn.close()
    return stats
Practical Tips
Account management: Create accounts with different email providers. Glassdoor limits content access per account. If one gets flagged, you have backups.
Rate yourself appropriately: 3-6 second delays between page loads. Glassdoor's DataDome is particularly sensitive to rapid-fire requests. At higher volumes, each request should come from a different IP.
Session files expire: Re-authenticate every few days. Glassdoor invalidates sessions after roughly 7 days of inactivity.
Selectors change: Glassdoor updates their frontend frequently. The CSS selectors in this guide will break eventually. The GraphQL interception approach is more resilient — the API contract changes less often than class names.
Don't scrape what you can get via API: Glassdoor has a partner API for employers. If you're a company scraping your own reviews, apply for API access instead. It's rate-limited but legitimate and stable.
Multi-Company Comparative Analysis
The real value of Glassdoor data is comparing companies in the same industry. Here is how to build a comparative intelligence dataset:
import sqlite3
import json
import statistics
from datetime import datetime
# Example company ID map (Glassdoor company IDs).
# The ID is the numeric suffix in Glassdoor URLs, e.g. the "9079" in
# /Reviews/Google-Reviews-E9079.htm.
TECH_COMPANIES = {
    "Google": "9079",
    "Amazon": "6036",
    "Meta": "40772",
    "Apple": "1138",
    "Microsoft": "1651",
    "Salesforce": "11159",
    "Netflix": "11891",
    "Uber": "575263",
    "Airbnb": "391850",
    "Stripe": "676401",
}
def init_comparative_db(db_path: str = "glassdoor.db") -> sqlite3.Connection:
    """Create (if needed) the comparative-analysis schema and return an open connection.

    Tables: companies, reviews, salaries, interviews, plus two lookup
    indexes. The caller owns the returned connection and must close it.
    """
    conn = sqlite3.connect(db_path)
    conn.executescript("""
        CREATE TABLE IF NOT EXISTS companies (
            company_id TEXT PRIMARY KEY,
            name TEXT,
            industry TEXT,
            scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        );
        CREATE TABLE IF NOT EXISTS reviews (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            company TEXT,
            company_id TEXT,
            title TEXT,
            rating REAL,
            pros TEXT,
            cons TEXT,
            role TEXT,
            review_date TEXT,
            recommend INTEGER,
            ceo_approval TEXT,
            scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        );
        CREATE TABLE IF NOT EXISTS salaries (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            company TEXT,
            company_id TEXT,
            job_title TEXT,
            median_pay TEXT,
            pay_range TEXT,
            sample_count TEXT,
            scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        );
        CREATE TABLE IF NOT EXISTS interviews (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            company TEXT,
            company_id TEXT,
            role TEXT,
            outcome TEXT,
            difficulty TEXT,
            description TEXT,
            questions TEXT,
            interview_date TEXT,
            scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        );
        CREATE INDEX IF NOT EXISTS idx_reviews_company
        ON reviews(company, rating);
        CREATE INDEX IF NOT EXISTS idx_salaries_title
        ON salaries(job_title, company);
    """)
    conn.commit()
    return conn
def save_reviews_batch(
    conn: sqlite3.Connection,
    company: str,
    company_id: str,
    reviews: list,
):
    """Save a batch of reviews to the database.

    Improvement vs. the original: a single executemany() with one prepared
    statement replaces per-row execute() calls; commits once at the end.

    Args:
        conn: Open connection whose schema matches init_comparative_db().
        company: Display name stored in the `company` column.
        company_id: Glassdoor numeric ID.
        reviews: Review dicts as produced by scrape_company_reviews().
    """
    now = datetime.now().isoformat()
    rows = [
        (
            company, company_id,
            r.get("title"), r.get("rating"),
            r.get("pros"), r.get("cons"),
            r.get("role"), r.get("date"),
            1 if r.get("recommend") else 0,
            r.get("ceo_approval"),
            now,
        )
        for r in reviews
    ]
    conn.executemany(
        """INSERT INTO reviews
        (company, company_id, title, rating, pros, cons, role,
        review_date, recommend, ceo_approval, scraped_at)
        VALUES (?,?,?,?,?,?,?,?,?,?,?)""",
        rows,
    )
    conn.commit()
Sentiment Analysis on Review Text
Extract structured insights from the free-text pros/cons:
import re
# Keyword lexicon for crude dimension-level sentiment scoring.
# Each dimension maps to lowercase substrings matched against the
# concatenated pros+cons text (see score_review_dimensions).
SENTIMENT_KEYWORDS = {
    "work_life_balance": {
        "positive": ["work life balance", "flexible", "remote", "wfh", "unlimited pto"],
        "negative": ["long hours", "no balance", "burnout", "overworked", "60 hours", "always on call"],
    },
    "management": {
        "positive": ["great manager", "supportive leadership", "good management", "transparent"],
        "negative": ["micromanage", "bad manager", "toxic", "poor leadership", "no direction"],
    },
    "compensation": {
        "positive": ["great pay", "competitive salary", "generous", "good benefits", "equity"],
        "negative": ["underpaid", "low salary", "poor compensation", "no raise", "below market"],
    },
    "culture": {
        "positive": ["great culture", "inclusive", "collaborative", "smart coworkers", "fun team"],
        "negative": ["politics", "toxic culture", "layoffs", "high turnover", "cutthroat"],
    },
    "growth": {
        "positive": ["career growth", "learning", "promotions", "development", "mentorship"],
        "negative": ["no growth", "stagnant", "no promotion", "limited learning", "dead end"],
    },
}
def score_review_dimensions(review: dict) -> dict:
    """Score a review across key workplace dimensions.

    Counts SENTIMENT_KEYWORDS substring hits in the combined pros/cons text;
    a dimension appears in the result only when at least one keyword matched,
    with score = positive hits - negative hits.
    """
    combined = ((review.get("pros") or "") + " " + (review.get("cons") or "")).lower()
    result = {}
    for dimension, lexicon in SENTIMENT_KEYWORDS.items():
        pos = sum(phrase in combined for phrase in lexicon["positive"])
        neg = sum(phrase in combined for phrase in lexicon["negative"])
        if pos or neg:
            result[dimension] = pos - neg
    return result
def analyze_company_culture_dimensions(
    company: str,
    db_path: str = "glassdoor.db",
) -> dict:
    """Aggregate per-dimension sentiment scores across a company's reviews.

    Returns {"error": "no_reviews", ...} when the company has no stored
    reviews; otherwise totals, mean rating, and per-dimension sentiment
    summaries (label, mean score, mention count).
    """
    conn = sqlite3.connect(db_path)
    rows = conn.execute(
        "SELECT title, rating, pros, cons, role FROM reviews WHERE company = ?",
        (company,)
    ).fetchall()
    conn.close()
    if not rows:
        return {"error": "no_reviews", "company": company}

    per_dimension = {name: [] for name in SENTIMENT_KEYWORDS}
    rating_values = []
    for title, rating, pros, cons, _role in rows:
        if rating:
            rating_values.append(rating)
        hits = score_review_dimensions(
            {"title": title, "rating": rating, "pros": pros, "cons": cons}
        )
        for name, value in hits.items():
            per_dimension[name].append(value)

    summary = {
        "company": company,
        "total_reviews": len(rows),
        "avg_rating": round(statistics.mean(rating_values), 2) if rating_values else None,
        "dimensions": {},
    }
    for name, values in per_dimension.items():
        if not values:
            continue
        mean_score = statistics.mean(values)
        if mean_score > 0:
            label = "positive"
        elif mean_score < 0:
            label = "negative"
        else:
            label = "neutral"
        summary["dimensions"][name] = {
            "sentiment": label,
            "score": round(mean_score, 2),
            "mentions": len(values),
        }
    return summary
Salary Intelligence Reports
Extract and normalize salary data for market research:
def extract_salary_value(pay_text: str) -> tuple:
    """Extract numeric salary figures from Glassdoor's pay text.

    Handles display strings such as "$X/yr", "$X,XXX", and "$XX - $YY".
    Returns (low, high); high is None for a single figure, and (None, None)
    when no dollar amount is present.
    """
    if not pay_text:
        return None, None
    compact = pay_text.replace(" ", "")
    values = []
    for token in re.findall(r'\$([\d,]+)', compact):
        try:
            values.append(int(token.replace(",", "")))
        except ValueError:
            continue
    if not values:
        return None, None
    if len(values) == 1:
        return values[0], None
    return values[0], values[1]
def analyze_salary_by_role(
    role_keywords: list,
    db_path: str = "glassdoor.db",
) -> dict:
    """Compare salaries for a role across companies.

    Matches job titles against any of role_keywords (SQL LIKE, bound
    parameters), parses the stored pay text, and returns per-company
    median/min/max stats sorted by median salary, descending.
    """
    like_clause = " OR ".join("job_title LIKE ?" for _ in role_keywords)
    conn = sqlite3.connect(db_path)
    rows = conn.execute(
        f"""SELECT company, job_title, median_pay, pay_range, sample_count
        FROM salaries
        WHERE {like_clause}
        ORDER BY company""",
        tuple(f'%{kw}%' for kw in role_keywords),
    ).fetchall()
    conn.close()

    grouped = {}
    for company, title, pay_text, _pay_range, count in rows:
        value, _ = extract_salary_value(pay_text)
        # $30k floor filters obviously bad parses (same rule as before)
        if not value or value <= 30000:
            continue
        grouped.setdefault(company, []).append({
            "title": title,
            "salary": value,
            "sample_count": count,
        })

    summary = {}
    for company, entries in grouped.items():
        pay_values = [e["salary"] for e in entries]
        if pay_values:
            summary[company] = {
                "median_salary": statistics.median(pay_values),
                "min_salary": min(pay_values),
                "max_salary": max(pay_values),
                "roles_found": len(entries),
                "titles": [e["title"] for e in entries[:3]],
            }
    return dict(
        sorted(summary.items(), key=lambda item: item[1]["median_salary"], reverse=True)
    )
# Compare SWE salaries across big tech
# (assumes the salaries table has been populated by the pipeline first)
swe_salaries = analyze_salary_by_role(
    ["software engineer", "software developer", "backend engineer"],
    db_path="glassdoor.db",
)
print("Software Engineer salaries by company:")
for company, data in swe_salaries.items():
    print(f" {company:<20} ${data['median_salary']:,.0f}/yr ({data['roles_found']} data points)")
Identifying Interview Difficulty Trends
Track how interview processes change over time:
def analyze_interview_patterns(
    company: str,
    db_path: str = "glassdoor.db",
) -> dict:
    """Analyze interview difficulty, common questions, and outcomes.

    Returns outcome/difficulty histograms (keys are lowercased raw strings;
    missing values count under ""), the 20 most frequent question keywords,
    and a positive-outcome rate.

    Fixes vs. the original: the trailing `if interviews else 0` was dead
    code (the early return above guarantees non-empty), and the stopword
    list is now a set built once instead of a list scanned per word.
    """
    conn = sqlite3.connect(db_path)
    interviews = conn.execute(
        """SELECT role, outcome, difficulty, description, questions
        FROM interviews WHERE company = ?""",
        (company,)
    ).fetchall()
    conn.close()
    if not interviews:
        return {"error": "no_data"}
    outcomes = {}
    difficulties = {}
    common_question_words = {}
    # Filler words excluded from question-theme extraction
    stopwords = {"that", "this", "with", "from", "they", "were", "what", "have", "been"}
    for row in interviews:
        outcome = (row[1] or "").lower()
        difficulty = (row[2] or "").lower()
        outcomes[outcome] = outcomes.get(outcome, 0) + 1
        difficulties[difficulty] = difficulties.get(difficulty, 0) + 1
        # Extract question keywords (4+ letter lowercase words)
        questions_text = row[4] or ""
        for word in re.findall(r'\b[a-z]{4,}\b', questions_text.lower()):
            if word not in stopwords:
                common_question_words[word] = common_question_words.get(word, 0) + 1
    top_words = sorted(common_question_words.items(), key=lambda x: x[1], reverse=True)[:20]
    return {
        "company": company,
        "total_interviews": len(interviews),
        "outcome_distribution": outcomes,
        "difficulty_distribution": difficulties,
        "common_question_themes": [w for w, _ in top_words],
        # NOTE(review): matches only the literal outcome text "accepted";
        # phrasing like "accepted offer" would not count — confirm against
        # real Glassdoor outcome strings.
        "positive_outcome_rate": round(
            outcomes.get("accepted", 0) / len(interviews) * 100, 1
        ),
    }
Automated Competitive Intelligence Pipeline
import asyncio
async def run_glassdoor_intelligence_pipeline(
    companies: dict,
    db_path: str = "glassdoor.db",
    proxy_url: str = None,
):
    """
    Full competitive intelligence pipeline.
    Collects reviews and salary data for multiple companies into db_path.

    Fixes vs. the original:
    - `random` was used for the jittered delays but never imported in this
      section (NameError at runtime); imported locally.
    - A browser context was launched per company via create_glassdoor_context
      and never used — scrape_company_reviews/scrape_salary_data each open
      their own Playwright session — so the wasted launch is removed.

    Args:
        companies: {display_name: glassdoor_numeric_id} map (e.g. TECH_COMPANIES).
        db_path: SQLite file initialized via init_comparative_db().
        proxy_url: Optional proxy passed through to the scrapers.
    """
    import random  # jittered inter-request delays

    conn = init_comparative_db(db_path)
    for company_name, company_id in companies.items():
        print(f"\nProcessing: {company_name} (ID: {company_id})")
        try:
            # Reviews
            review_slug = f"{company_name.replace(' ', '-')}-Reviews-E{company_id}"
            reviews = await scrape_company_reviews(
                review_slug, max_pages=3, proxy_url=proxy_url
            )
            save_reviews_batch(conn, company_name, company_id, reviews)
            print(f" Reviews: {len(reviews)}")
            await asyncio.sleep(random.uniform(15, 30))
            # Salaries
            salary_slug = f"{company_name.replace(' ', '-')}-Salaries-E{company_id}"
            salaries = await scrape_salary_data(salary_slug, proxy_url=proxy_url)
            now = datetime.now().isoformat()
            for s in salaries:
                conn.execute(
                    """INSERT INTO salaries
                    (company, company_id, job_title, median_pay, pay_range, sample_count, scraped_at)
                    VALUES (?,?,?,?,?,?,?)""",
                    (company_name, company_id, s.get("job_title"),
                     s.get("median_pay"), s.get("pay_range"),
                     s.get("sample_count"), now),
                )
            conn.commit()
            print(f" Salaries: {len(salaries)}")
            await asyncio.sleep(random.uniform(15, 30))
        except Exception as e:
            # Best-effort per company: log and continue with the next one
            print(f" Error: {e}")
        finally:
            # Long cool-down between companies
            await asyncio.sleep(random.uniform(30, 60))
    conn.close()
    print("\nPipeline complete")
# Generate comparative report
async def generate_comparative_report(
    companies: list,
    db_path: str = "glassdoor.db",
) -> dict:
    """Generate a comparative intelligence report across companies.

    Combines per-company culture-dimension analysis and interview-pattern
    analysis, then ranks companies by average rating (descending).
    """
    per_company = {}
    for name in companies:
        culture = analyze_company_culture_dimensions(name, db_path)
        per_company[name] = {
            "avg_rating": culture.get("avg_rating"),
            "total_reviews": culture.get("total_reviews"),
            "culture_dimensions": culture.get("dimensions", {}),
            "interview_patterns": analyze_interview_patterns(name, db_path),
        }
    # Rank only the companies that actually have a rating
    rated = [
        (name, info["avg_rating"])
        for name, info in per_company.items()
        if info.get("avg_rating")
    ]
    rated.sort(key=lambda pair: pair[1], reverse=True)
    return {
        "generated_at": datetime.now().isoformat(),
        "companies": per_company,
        "rankings": {
            "by_overall_rating": [{"company": n, "rating": r} for n, r in rated],
        },
    }
# Kick off the full pipeline for the mapped companies.
# NOTE: replace the placeholder proxy credentials before running.
asyncio.run(run_glassdoor_intelligence_pipeline(
    companies=TECH_COMPANIES,
    proxy_url="http://USER:[email protected]:9000",
))
Key Takeaways for Glassdoor Scraping in 2026
- Glassdoor requires a logged-in session for anything beyond the first 1-2 results; create a free account, log in with Playwright once, and save the session state to disk
- DataDome protects most Glassdoor pages -- it checks TLS fingerprint, browser properties, and behavioral patterns; Playwright with stealth patches is required
- ThorData's residential proxy network handles DataDome's IP reputation scoring; each new browser context rotates to a fresh IP
- Per-IP rate limiting is strict: 3-7 second random delays, and create a new browser context per company to rotate IPs
- Session files expire after ~7 days -- build in re-authentication logic for long-running pipelines
- The GraphQL API interception approach is more resilient than CSS selector parsing -- class names change frequently, API response structure does not
- The most valuable use cases: salary benchmarking across companies, culture dimension sentiment analysis, and interview difficulty trend tracking