How to Scrape LinkedIn Job Postings in 2026: No Login Required
LinkedIn's public job search is one of the most useful datasets you can scrape without any authentication. Job titles, company names, locations, posted dates, salary ranges when listed — all available at linkedin.com/jobs/search without logging in.
This guide covers everything from the URL structure and HTML selectors to pagination handling, anti-bot evasion, salary normalization, and building a continuously-updated job market dataset.
Why Scrape LinkedIn Job Postings
LinkedIn is the largest job board in the world. Its public listings are a gold mine for:
- Salary research — Build a dataset of salary ranges by role, location, and seniority
- Job market analytics — Track hiring trends: which companies are growing, which skills are in demand
- Competitor intelligence — Monitor when competitors post engineering roles (signals new product development)
- Lead generation — Companies hiring for specific roles often need related services
- Academic research — Study labor market dynamics, skills evolution, remote work trends
- Personal job search — Build custom alerts with filters LinkedIn's UI does not support
How LinkedIn Public Job Search Works
LinkedIn exposes job search results at a clean URL structure:
https://www.linkedin.com/jobs/search/?keywords=python+developer&location=New+York&geoId=105080838&start=0
Key parameters:
- keywords — job title or skill, URL-encoded
- location — human-readable location string
- geoId — LinkedIn's internal geographic ID
- f_TPR — time posted filter: r86400 (24h), r604800 (week), r2592000 (month)
- f_E — experience level: 1 (internship), 2 (entry), 3 (associate), 4 (mid-senior), 5 (director)
- f_WT — work type: 1 (on-site), 2 (remote), 3 (hybrid)
- start — pagination offset, increments by 25
You do not need an API key or session token. LinkedIn renders job cards in the initial HTML response for unauthenticated users.
Common GeoIDs Reference
| Location | GeoID |
|---|---|
| United States | 103644278 |
| New York, NY | 105080838 |
| San Francisco Bay Area | 90000084 |
| London, UK | 90009496 |
| Berlin, Germany | 103035651 |
| Remote (Worldwide) | 92000000 |
| Toronto, Canada | 100025096 |
| Sydney, Australia | 104769905 |
Complete Working Scraper
#!/usr/bin/env python3
"""
LinkedIn public job scraper — no login required.
Usage: pip install requests beautifulsoup4 && python3 linkedin_jobs.py
"""
import requests
from bs4 import BeautifulSoup
import time
import random
import json
import csv
from pathlib import Path
from datetime import datetime
# Rotate headers to avoid fingerprinting.
# Each entry is a coherent browser profile (User-Agent, Accept-Language,
# Accept, Referer). get_headers() picks one at random per request so
# consecutive requests do not share an identical header fingerprint.
HEADERS_POOL = [
    {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/124.0.0.0 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Referer': 'https://www.google.com/',
    },
    {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 Version/17.4 Safari/605.1.15',
        'Accept-Language': 'en-GB,en;q=0.9',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Referer': 'https://www.linkedin.com/',
    },
    {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/123.0.0.0 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.8',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Referer': 'https://www.google.com/search?q=linkedin+jobs',
    },
]
def get_headers():
    """Pick one header profile at random from HEADERS_POOL."""
    pool_index = random.randrange(len(HEADERS_POOL))
    return HEADERS_POOL[pool_index]
def fetch_job_listings(
    keywords: str,
    location: str,
    geo_id: str,
    start: int = 0,
    time_filter: str | None = None,
    experience: str | None = None,
    work_type: str | None = None,
    proxies: dict | None = None,
) -> BeautifulSoup:
    """Fetch one page of LinkedIn public job-search results.

    Args:
        keywords: Job title or skill to search for.
        location: Human-readable location string.
        geo_id: LinkedIn internal geographic ID (see GeoID table).
        start: Pagination offset; LinkedIn pages in steps of 25.
        time_filter: Optional f_TPR code, e.g. 'r86400' for last 24h.
        experience: Optional f_E experience-level code.
        work_type: Optional f_WT code (1 on-site, 2 remote, 3 hybrid).
        proxies: Optional requests-style proxy mapping.

    Returns:
        Parsed BeautifulSoup of the results page HTML.

    Raises:
        requests.exceptions.HTTPError: On non-2xx responses (e.g. 429).
    """
    url = 'https://www.linkedin.com/jobs/search/'
    params = {
        'keywords': keywords,
        'location': location,
        'geoId': geo_id,
        'start': start,
        'position': 1,
        'pageNum': 0,
    }
    # Optional filters are only included when explicitly provided.
    if time_filter:
        params['f_TPR'] = time_filter
    if experience:
        params['f_E'] = experience
    if work_type:
        params['f_WT'] = work_type
    response = requests.get(
        url,
        params=params,
        headers=get_headers(),
        proxies=proxies,
        timeout=15,
    )
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')
def parse_job_cards(soup: BeautifulSoup) -> list[dict]:
    """Extract one metadata dict per job card from a parsed results page."""
    extracted = []
    for card in soup.select('div.base-card[data-entity-urn]'):
        # The URN looks like "urn:li:jobPosting:<id>"; the ID is the last part.
        urn = card.get('data-entity-urn', '')
        job_id = urn.split(':')[-1] if urn else None

        def text_of(selector):
            # Stripped text of the first matching element, or None.
            el = card.select_one(selector)
            return el.get_text(strip=True) if el else None

        date_el = card.select_one('time')
        extracted.append({
            'id': job_id,
            'title': text_of('h3.base-search-card__title'),
            'company': text_of('h4.base-search-card__subtitle a'),
            'location': text_of('span.job-search-card__location'),
            'posted': date_el.get('datetime') if date_el else None,
            'salary': text_of('span.job-search-card__salary-info'),
            'url': f'https://www.linkedin.com/jobs/view/{job_id}/' if job_id else None,
        })
    return extracted
def fetch_job_detail(job_id: str, proxies: dict = None) -> dict:
    """Fetch a single job's detail page; return description and criteria fields."""
    response = requests.get(
        f'https://www.linkedin.com/jobs/view/{job_id}/',
        headers=get_headers(),
        proxies=proxies,
        timeout=15,
    )
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # The "job criteria" list pairs an h3 label with a span value.
    criteria = {}
    for item in soup.select('li.description__job-criteria-item'):
        label_el = item.select_one('h3')
        value_el = item.select_one('span')
        if label_el and value_el:
            criteria[label_el.get_text(strip=True)] = value_el.get_text(strip=True)

    description_el = soup.select_one('div.show-more-less-html__markup')
    description = None
    if description_el is not None:
        description = description_el.get_text(separator='\n', strip=True)
    return {
        'description': description,
        'seniority_level': criteria.get('Seniority level'),
        'employment_type': criteria.get('Employment type'),
        'job_function': criteria.get('Job function'),
        'industries': criteria.get('Industries'),
    }
def scrape_all_pages(
    keywords: str,
    location: str,
    geo_id: str,
    max_pages: int = 5,
    proxies: dict = None,
    **filters,
) -> list[dict]:
    """Walk paginated results (25 per page) until empty, error, or max_pages."""
    collected = []
    for page_index in range(max_pages):
        offset = page_index * 25
        print(f' Page {page_index + 1} (offset {offset})...', end=' ')
        try:
            page_soup = fetch_job_listings(keywords, location, geo_id, start=offset, proxies=proxies, **filters)
            page_jobs = parse_job_cards(page_soup)
        except requests.exceptions.HTTPError as e:
            # A 4xx/5xx (commonly 429) means stop rather than hammer the site.
            print(f'HTTP {e.response.status_code} — stopping')
            break
        if not page_jobs:
            print('no results — stopping')
            break
        print(f'found {len(page_jobs)} jobs')
        collected.extend(page_jobs)
        # Randomized pause between pages to look less bot-like.
        time.sleep(random.uniform(3.5, 7.0))
    return collected
Pagination Strategy
LinkedIn returns 25 results per page. Use start=0, start=25, start=50, and so on. In practice, results dry up after 10-15 pages regardless of what the UI says — LinkedIn caps public search at around 1000 listings per query.
To get more results, vary your search parameters:
# Instead of one big search, run multiple targeted searches
searches = [
    {'keywords': 'python developer', 'geo_id': '105080838'},  # NYC
    {'keywords': 'python developer', 'geo_id': '90000084'},  # SF Bay
    {'keywords': 'python developer', 'geo_id': '92000000'},  # Remote
    {'keywords': 'django developer', 'geo_id': '103644278'},  # US
    {'keywords': 'fastapi engineer', 'geo_id': '103644278'},  # US
]
all_jobs = []
seen_ids = set()  # job IDs already collected, for cross-search dedup
for search in searches:
    jobs = scrape_all_pages(
        location='United States',
        max_pages=5,
        time_filter='r604800',  # last week
        **search,
    )
    # The same listing can surface under several queries; keep first copy only.
    for job in jobs:
        if job['id'] not in seen_ids:
            seen_ids.add(job['id'])
            all_jobs.append(job)
    time.sleep(random.uniform(10, 20))  # longer pause between searches
print(f'Collected {len(all_jobs)} unique jobs across all searches')
Anti-Bot Measures and How to Handle Them
LinkedIn is aggressive about bot detection:
| Behavior | Risk | What Happens |
|---|---|---|
| Fixed 1-second delays | High | 429 after ~5 pages |
| Same User-Agent for all requests | Medium | CAPTCHA challenge |
| Datacenter IP | High | Blocked after ~10 requests |
| 100+ requests/hour from one IP | High | Temporary IP ban |
| Fetching job details immediately after search | Medium | CAPTCHA on detail page |
| Same query repeated rapidly | High | Empty results returned |
What Works
Rotate headers — The script above uses a pool of 3 User-Agent and header combinations. Expand this pool for production.
Residential proxies — For serious LinkedIn scraping, ThorData's residential proxies are essential. Residential IPs rotate automatically and look like real users from different locations.
# ThorData residential proxy — https://thordata.partnerstack.com/partner/0a0x4nzb (or [Oxylabs](https://oxylabs.go2cloud.org/aff_c?offer_id=7&aff_id=2066&url_id=174))
# Gateway credentials; both HTTP and HTTPS traffic are tunneled through
# the same residential endpoint so LinkedIn sees rotating residential IPs.
PROXY_USER = 'your_username'
PROXY_PASS = 'your_password'
PROXY_HOST = 'proxy.thordata.net'
PROXY_PORT = 9000
proxies = {
    'http': f'http://{PROXY_USER}:{PROXY_PASS}@{PROXY_HOST}:{PROXY_PORT}',
    'https': f'http://{PROXY_USER}:{PROXY_PASS}@{PROXY_HOST}:{PROXY_PORT}',
}
# Pass to every request
response = requests.get(url, headers=get_headers(), proxies=proxies, timeout=20)
Realistic delays — Vary between 3.5 and 8 seconds between page requests. Add occasional longer pauses (15-30 seconds) every 5-10 requests.
Separate scraping phases — Collect all job card data first in one pass, then fetch details in a second pass with longer delays.
Respect the 429 — If you get a 429 status code, back off for at least 60 seconds:
def safe_request(url: str, proxies: dict = None, max_retries: int = 3) -> requests.Response:
    """GET *url* with rotating headers, honoring LinkedIn's 429 rate limits.

    On a 429, waits the server's Retry-After seconds if given, otherwise an
    exponentially growing default (60s, then 120s, then 240s), and retries.
    Other HTTP errors raise immediately via raise_for_status().

    Raises:
        requests.exceptions.HTTPError: On non-429 error responses.
        Exception: After max_retries consecutive 429 responses.
    """
    backoff = 60  # default wait, doubled after each 429 (60 -> 120 -> 240)
    for attempt in range(max_retries):
        resp = requests.get(url, headers=get_headers(), proxies=proxies, timeout=15)
        if resp.status_code == 429:
            # Prefer the server's own Retry-After hint over our backoff.
            wait = int(resp.headers.get('Retry-After', backoff))
            print(f' Rate limited — waiting {wait}s')
            time.sleep(wait)
            backoff *= 2
            continue
        resp.raise_for_status()
        return resp
    raise Exception(f'Failed after {max_retries} attempts')
Building a Salary Database
One high-value use case is building a salary comparison database from LinkedIn listings that include compensation data:
import re
import statistics
def normalize_salary(raw: str) -> dict | None:
if not raw:
return None
raw_lower = raw.lower()
numbers = re.findall(r'[\d,]+', raw.replace(',', ''))
if not numbers:
return None
amounts = [int(n) for n in numbers if n]
if not amounts:
return None
# Determine period and annualize
is_hourly = 'hr' in raw_lower or 'hour' in raw_lower
is_monthly = 'month' in raw_lower or '/mo' in raw_lower
def annualize(amt):
if is_hourly: return amt * 2080
if is_monthly: return amt * 12
return amt
annual = [annualize(a) for a in amounts]
return {
'min': min(annual),
'max': max(annual),
'mid': sum(annual) / len(annual),
'raw': raw,
'period': 'hourly' if is_hourly else 'monthly' if is_monthly else 'annual',
}
def extract_salary_data(jobs: list[dict]) -> list[dict]:
    """Flatten jobs whose salary string parses into analysis-ready rows."""
    rows = []
    for job in jobs:
        parsed = normalize_salary(job.get('salary'))
        if not parsed:
            continue  # no usable salary on this listing
        rows.append({
            'title': job['title'],
            'company': job['company'],
            'location': job['location'],
            'salary_min': parsed['min'],
            'salary_max': parsed['max'],
            'salary_mid': parsed['mid'],
            'salary_raw': parsed['raw'],
            'url': job.get('url'),
        })
    return rows
def print_salary_report(salary_jobs: list[dict]) -> None:
    """Print summary statistics and the ten best-paying listings."""
    if not salary_jobs:
        print('No salary data found')
        return
    mids = [entry['salary_mid'] for entry in salary_jobs]
    lows = [entry['salary_min'] for entry in salary_jobs]
    highs = [entry['salary_max'] for entry in salary_jobs]
    print(f'Salary report: {len(salary_jobs)} jobs with salary data')
    print(f' Min: ${min(lows):>9,.0f}')
    print(f' Max: ${max(highs):>9,.0f}')
    print(f' Median: ${statistics.median(mids):>9,.0f}')
    print(f' Mean: ${statistics.mean(mids):>9,.0f}')
    print()
    print('Top 10 by salary:')
    top_ten = sorted(salary_jobs, key=lambda x: x['salary_max'], reverse=True)[:10]
    for job in top_ten:
        print(f" ${job['salary_min']:>7,.0f} - ${job['salary_max']:>7,.0f} {job['title']} at {job['company']}")
Remote Job Filtering
LinkedIn's work type filter (f_WT=2) isolates remote positions:
# Remote-only search: f_WT=2 restricts results to remote listings.
remote_jobs = scrape_all_pages(
    keywords='data engineer',
    location='United States',
    geo_id='103644278',
    max_pages=8,
    work_type='2',  # 2 = remote
    time_filter='r604800',  # past week
)
print(f'Found {len(remote_jobs)} remote positions')
# Separate filter for hybrid
hybrid_jobs = scrape_all_pages(
    keywords='data engineer',
    location='New York',
    geo_id='105080838',
    max_pages=4,
    work_type='3',  # 3 = hybrid
)
print(f'Found {len(hybrid_jobs)} hybrid positions in NYC')
Saving Results
import json, csv
from pathlib import Path
from datetime import datetime
def save_results(jobs: list[dict], prefix: str = 'linkedin_jobs') -> None:
    """Write *jobs* to timestamped JSON and CSV files; no-op when empty."""
    if not jobs:
        return
    stamp = datetime.now().strftime('%Y%m%d_%H%M')
    # JSON keeps every field of every job.
    json_file = Path(f'{prefix}_{stamp}.json')
    json_file.write_text(json.dumps(jobs, indent=2, ensure_ascii=False))
    # CSV keeps only the tabular columns; extra keys are silently dropped.
    csv_file = Path(f'{prefix}_{stamp}.csv')
    columns = ['id', 'title', 'company', 'location', 'salary', 'posted', 'url']
    with open(csv_file, 'w', newline='', encoding='utf-8') as handle:
        writer = csv.DictWriter(handle, fieldnames=columns, extrasaction='ignore')
        writer.writeheader()
        writer.writerows(jobs)
    print(f'Saved {len(jobs)} jobs to {json_file} and {csv_file}')
Keyword-Specific Job Monitoring
Set up monitoring for specific keywords to catch new listings as they appear:
import json
from pathlib import Path
from datetime import datetime
MONITOR_STATE = Path('linkedin_monitor_state.json')
def load_seen_jobs() -> set:
    """Load previously-seen job IDs from the monitor state file."""
    if not MONITOR_STATE.exists():
        return set()
    state = json.loads(MONITOR_STATE.read_text())
    return set(state.get('seen_ids', []))
def save_seen_jobs(seen_ids: set) -> None:
    """Persist the seen-ID set to the monitor state file as JSON."""
    payload = json.dumps({'seen_ids': list(seen_ids)}, indent=2)
    MONITOR_STATE.write_text(payload)
def monitor_jobs(
    keywords: str,
    location: str,
    geo_id: str,
    alert_file: str = 'new_jobs.json',
) -> list[dict]:
    """Scrape the last 24h of listings and surface only never-seen jobs.

    New job IDs are persisted so subsequent runs skip them; new listings
    are appended to *alert_file*.
    """
    seen_ids = load_seen_jobs()
    scraped = scrape_all_pages(
        keywords, location, geo_id,
        max_pages=4,
        time_filter='r86400',  # last 24 hours only
    )
    new_jobs = [job for job in scraped if job.get('id') and job['id'] not in seen_ids]
    if not new_jobs:
        print(f'No new listings for "{keywords}"')
        return new_jobs
    # Update seen IDs
    seen_ids.update(job['id'] for job in new_jobs)
    save_seen_jobs(seen_ids)
    # Append the new listings to the alert file
    out = Path(alert_file)
    existing = json.loads(out.read_text()) if out.exists() else []
    existing.extend(new_jobs)
    out.write_text(json.dumps(existing, indent=2))
    print(f'Found {len(new_jobs)} new listings for "{keywords}"')
    return new_jobs
# Run daily to catch new listings
monitor_jobs('machine learning engineer', 'Remote', '92000000')
Industry Analytics
With a large dataset, you can analyze hiring patterns across industries:
from collections import Counter
import re
def analyze_job_dataset(jobs: list[dict]) -> dict:
    """Print dataset-level stats and return top companies/locations."""
    # Company hiring frequency
    companies = Counter(job['company'] for job in jobs if job.get('company'))
    # Location distribution
    locations = Counter(job['location'] for job in jobs if job.get('location'))
    # Share of listings that disclose a salary
    with_salary = [job for job in jobs if job.get('salary')]
    salary_rate = len(with_salary) / len(jobs) if jobs else 0
    # Listings whose posted date contains today's date string
    today = datetime.now().strftime('%Y-%m-%d')
    today_count = sum(1 for job in jobs if job.get('posted') and today in job['posted'])
    print(f'Dataset: {len(jobs)} jobs')
    print(f' Salary disclosed: {salary_rate:.1%}')
    print(f' Posted today: {today_count}')
    print(f' Unique companies: {len(companies)}')
    print(f' Unique locations: {len(locations)}')
    print()
    print('Top hiring companies:')
    for company, count in companies.most_common(10):
        print(f' {count:>4} postings: {company}')
    print()
    print('Top locations:')
    for location, count in locations.most_common(10):
        print(f' {count:>4}: {location}')
    return {
        'total_jobs': len(jobs),
        'salary_coverage': salary_rate,
        'top_companies': dict(companies.most_common(20)),
        'top_locations': dict(locations.most_common(20)),
    }
Summary
LinkedIn public job search is accessible without authentication. The URL parameters are clean, the HTML structure is consistent enough to parse reliably, and you can get title, company, location, posted date, salary (when listed), and full descriptions from two endpoint types.
The limiting factor is rate limiting — residential proxies and realistic delays are what separate a scraper that runs for ten minutes from one that runs all day. Use ThorData's residential proxy pool to distribute requests across different IPs. Start conservative on delays (5+ seconds between requests), monitor your response codes, and increase throughput only once you have the proxy layer working.
For ongoing monitoring, combine the time filter parameters with incremental state tracking to catch only new listings each run — this minimizes request volume while keeping your dataset current.
Skills Demand Analysis
One of the most actionable insights from LinkedIn job data is understanding which skills are in demand. Analyze job descriptions to identify skill frequency:
import json
import re
from pathlib import Path
from collections import Counter
# Common tech skills to look for.
# Maps display name -> regex applied (case-insensitively) to lowercased
# description text. Word boundaries (\b) keep short names such as "go",
# "ml", or "java" from matching inside longer words.
SKILL_PATTERNS = {
    "Python": r"\bpython\b",
    "JavaScript": r"\bjavascript\b|\bjs\b",
    "TypeScript": r"\btypescript\b",
    "React": r"\breact\.?js\b|\breactjs\b|\breact\b",
    "Node.js": r"\bnode\.?js\b",
    "AWS": r"\baws\b|\bamazon web services\b",
    "Docker": r"\bdocker\b",
    "Kubernetes": r"\bkubernetes\b|\bk8s\b",
    "SQL": r"\bsql\b",
    "PostgreSQL": r"\bpostgresql\b|\bpostgres\b",
    "Machine Learning": r"\bmachine learning\b|\bml\b",
    "Data Science": r"\bdata science\b|\bdata scientist\b",
    "FastAPI": r"\bfastapi\b",
    "Django": r"\bdjango\b",
    "Rust": r"\brust\b",
    # NOTE(review): plain "Go" is deliberately not matched — only
    # "golang" / "go lang" — presumably to avoid false positives.
    "Go": r"\bgolang\b|\bgo lang\b",
    "Java": r"\bjava\b",
    "C++": r"\bc\+\+\b|\bcpp\b",
    "Terraform": r"\bterraform\b",
    "Redis": r"\bredis\b",
    "Spark": r"\bapache spark\b|\bpyspark\b|\bspark\b",
}
def extract_skills(description: str) -> list[str]:
    """Return SKILL_PATTERNS names whose regex matches *description*.

    Order follows SKILL_PATTERNS insertion order; empty input yields [].
    """
    if not description:
        return []
    desc_lower = description.lower()
    return [
        skill
        for skill, pattern in SKILL_PATTERNS.items()
        if re.search(pattern, desc_lower, re.IGNORECASE)
    ]
def analyze_skills_demand(jobs: list[dict]) -> dict:
    """Count skill mentions across job descriptions and print a ranked table."""
    skill_counts = Counter()
    described = 0  # jobs that actually had a description
    for job in jobs:
        text = job.get("description", "")
        if not text:
            continue
        skill_counts.update(extract_skills(text))
        described += 1
    if described == 0:
        return {}
    # Convert to percentage of jobs mentioning each skill
    skill_percentages = {
        skill: round(hits / described * 100, 1)
        for skill, hits in skill_counts.items()
    }
    print(f"\nSkills demand analysis ({described} job descriptions):")
    print(f"{'Skill':<20} {'Frequency':>10} {'% of Jobs':>10}")
    print("-" * 44)
    ranked = sorted(skill_percentages.items(), key=lambda x: x[1], reverse=True)
    for skill, pct in ranked:
        print(f"{skill:<20} {skill_counts[skill]:>10} {pct:>9.1f}%")
    return {
        "job_count": described,
        "skill_counts": dict(skill_counts),
        "skill_percentages": skill_percentages,
    }
Seniority Level Distribution
Understanding how LinkedIn distributes listings across seniority levels reveals market conditions:
from collections import Counter, defaultdict
import statistics
def analyze_seniority_distribution(jobs: list[dict]) -> dict:
    """Tabulate listings per seniority level plus average salary per level."""
    seniority_levels = Counter(
        job.get("seniority_level", "Not specified")
        for job in jobs
        if job.get("seniority_level")
    )
    # Collect mid-point salaries per level where normalized data exists.
    salary_by_seniority = defaultdict(list)
    for job in jobs:
        level = job.get("seniority_level")
        salary_data = job.get("salary_normalized")
        if level and salary_data and salary_data.get("mid"):
            salary_by_seniority[level].append(salary_data["mid"])
    print(f"\nSeniority level distribution ({len(jobs)} jobs):")
    print(f"{'Level':<25} {'Count':>7} {'% Total':>8} {'Avg Salary':>12}")
    print("-" * 56)
    total = sum(seniority_levels.values())
    for level, count in seniority_levels.most_common():
        share = count / total * 100
        level_salaries = salary_by_seniority.get(level, [])
        if level_salaries:
            avg_sal = f"${statistics.mean(level_salaries):>9,.0f}"
        else:
            avg_sal = "N/A"
        print(f"{level:<25} {count:>7} {share:>7.1f}% {avg_sal:>12}")
    return {
        "distribution": dict(seniority_levels),
        "salary_by_seniority": {
            level: {
                "count": len(vals),
                "avg": statistics.mean(vals) if vals else None,
                "median": statistics.median(vals) if vals else None,
            }
            for level, vals in salary_by_seniority.items()
        },
    }
Company Technology Stack Detection
When companies post multiple jobs, you can infer their tech stack from the collective requirements:
import json
from pathlib import Path
from collections import defaultdict
def infer_company_tech_stacks(jobs: list[dict]) -> dict:
    """Infer each company's tech stack from skill mentions across its posts.

    Aggregates extract_skills() hits per company and reports each
    company's ten most-mentioned skills; prints the ten companies with
    the highest total skill-mention counts.

    Returns:
        Mapping of company -> {'detected_skills': [...],
        'skill_frequency': {skill: count}}.
    """
    company_skills = defaultdict(Counter)
    for job in jobs:
        company = job.get("company")
        description = job.get("description", "")
        if not company or not description:
            continue
        for skill in extract_skills(description):
            company_skills[company][skill] += 1
    # (Fixed: removed an unused 'total_jobs' local computed from a
    # meaningless formula — it was never read.)
    result = {}
    for company, skill_counter in company_skills.items():
        top_skills = [skill for skill, _ in skill_counter.most_common(10)]
        result[company] = {
            "detected_skills": top_skills,
            "skill_frequency": dict(skill_counter.most_common(10)),
        }
    # Print companies with clearest tech stacks (most total skill mentions)
    print("Company tech stack analysis:")
    sorted_companies = sorted(
        result.items(),
        key=lambda x: sum(x[1]["skill_frequency"].values()),
        reverse=True,
    )
    for company, data in sorted_companies[:10]:
        skills_str = ", ".join(data["detected_skills"][:5])
        print(f" {company}: {skills_str}")
    return result
LinkedIn Easy Apply Detection
Some jobs use LinkedIn's "Easy Apply" feature. Detecting this from the HTML can help filter listings:
from bs4 import BeautifulSoup
import requests
import random
import time
def check_easy_apply(job_id: str, proxies: dict = None) -> bool:
    """Return True when the job's detail page contains an Easy Apply marker."""
    url = f"https://www.linkedin.com/jobs/view/{job_id}/"
    resp = requests.get(url, headers=get_headers(), proxies=proxies, timeout=15)
    if resp.status_code != 200:
        return False
    soup = BeautifulSoup(resp.text, "html.parser")
    # Easy Apply button has a specific class
    marker = soup.find("span", string=lambda s: s and "Easy Apply" in s)
    return marker is not None
def scrape_easy_apply_jobs(
    keywords: str,
    geo_id: str,
    max_pages: int = 5,
    proxies: dict = None,
) -> list[dict]:
    """Collect listings for a query, then keep only Easy Apply jobs.

    Each kept job is tagged with easy_apply=True. Detail-page checks are
    spaced out with randomized delays.
    """
    candidates = scrape_all_pages(
        keywords=keywords,
        location="",
        geo_id=geo_id,
        max_pages=max_pages,
        proxies=proxies,
    )
    matches = []
    print(f"Checking {len(candidates)} jobs for Easy Apply...")
    for index, job in enumerate(candidates):
        if job.get("id"):
            if check_easy_apply(job["id"], proxies=proxies):
                job["easy_apply"] = True
                matches.append(job)
            time.sleep(random.uniform(3, 6))
        # Periodic progress line every 10 jobs checked.
        if (index + 1) % 10 == 0:
            print(f" Checked {index+1}/{len(candidates)}, found {len(matches)} Easy Apply")
    return matches
Tracking Hiring Velocity
By monitoring how quickly new jobs appear in a category, you can measure hiring velocity — a leading indicator of company growth or contraction:
import json
from pathlib import Path
from datetime import datetime, timedelta
from collections import defaultdict
VELOCITY_DB = Path("linkedin_velocity.json")
def load_velocity_data() -> dict:
    """Read the velocity snapshot DB, or return an empty dict when absent."""
    if not VELOCITY_DB.exists():
        return {}
    return json.loads(VELOCITY_DB.read_text())
def record_job_snapshot(
    keywords: str,
    geo_id: str,
    location: str = "United States",
    proxies: dict = None,
) -> dict:
    """Snapshot the 24h job-posting volume for one query.

    Appends a timestamped entry (total new jobs + per-company counts)
    to the on-disk velocity DB so trends can be analyzed over time.

    Returns:
        Dict with 'total' job count and 'companies' per-company counts.
    """
    velocity_data = load_velocity_data()
    now = datetime.now().isoformat()
    # Get jobs posted in the last 24 hours
    jobs = scrape_all_pages(
        keywords=keywords,
        location=location,
        geo_id=geo_id,
        max_pages=4,
        time_filter="r86400",
        proxies=proxies,
    )
    # Count this window's postings per company.
    companies = defaultdict(int)
    for job in jobs:
        if job.get("company"):
            companies[job["company"]] += 1
    # One snapshot list per (keywords, geoId) pair.
    key = f"{keywords}_{geo_id}"
    if key not in velocity_data:
        velocity_data[key] = []
    velocity_data[key].append({
        "timestamp": now,
        "total_new_jobs": len(jobs),
        "companies": dict(companies),
    })
    VELOCITY_DB.write_text(json.dumps(velocity_data, indent=2))
    print(f"Snapshot: {len(jobs)} new jobs for '{keywords}' in 24h")
    # Show the five most active companies in this snapshot.
    for company, count in sorted(companies.items(), key=lambda x: x[1], reverse=True)[:5]:
        print(f" {company}: {count} new postings")
    return {"total": len(jobs), "companies": dict(companies)}
def analyze_velocity_trend(keywords: str, geo_id: str, days: int = 30) -> list[dict]:
    """Return stored snapshots for this query from the last *days* days."""
    snapshots = load_velocity_data().get(f"{keywords}_{geo_id}", [])
    if not snapshots:
        return []
    # ISO-8601 timestamps sort lexicographically, so string compare works.
    cutoff = (datetime.now() - timedelta(days=days)).isoformat()
    return [snap for snap in snapshots if snap["timestamp"] >= cutoff]
Building a Job Board Dashboard
All these capabilities combine into a personal job board dashboard that surfaces opportunities you would otherwise miss:
#!/usr/bin/env python3
"""
LinkedIn job intelligence dashboard.
Runs daily to collect listings, analyze skills demand, and surface insights.
"""
import json
import time
import random
from pathlib import Path
from datetime import datetime
# ThorData residential proxy — https://thordata.partnerstack.com/partner/0a0x4nzb (or [Oxylabs](https://oxylabs.go2cloud.org/aff_c?offer_id=7&aff_id=2066&url_id=174))
# Gateway credentials; all dashboard traffic is routed through the
# residential proxy endpoint.
PROXY_USER = "your_thordata_user"
PROXY_PASS = "your_thordata_pass"
PROXIES = {
    "http": f"http://{PROXY_USER}:{PROXY_PASS}@proxy.thordata.net:9000",
    "https": f"http://{PROXY_USER}:{PROXY_PASS}@proxy.thordata.net:9000",
}
# Target searches
SEARCHES = [
    {"keywords": "python developer", "location": "Remote", "geo_id": "92000000"},
    {"keywords": "backend engineer", "location": "Remote", "geo_id": "92000000"},
    {"keywords": "data engineer", "location": "Remote", "geo_id": "92000000"},
    {"keywords": "machine learning engineer", "location": "San Francisco Bay Area", "geo_id": "90000084"},
]
# All output files land here; created on import if missing.
OUTPUT_DIR = Path("job_dashboard")
OUTPUT_DIR.mkdir(exist_ok=True)
def run_daily_collection() -> None:
    """Run the daily pipeline: collect, dedupe, enrich, analyze, save.

    Collects the last 24h of listings for every entry in SEARCHES,
    dedupes by job ID, fetches full details for up to 30 jobs (salary
    listings first), runs the analytics helpers, and writes JSON/CSV
    outputs under OUTPUT_DIR.
    """
    print(f"=== LinkedIn Daily Job Collection ===")
    print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M')}")
    print()
    all_jobs = []
    seen_ids = set()  # cross-search dedup by job ID
    for search in SEARCHES:
        kw = search["keywords"]
        loc = search["location"]
        print(f"Collecting: {kw} in {loc}")
        jobs = scrape_all_pages(
            keywords=kw,
            location=loc,
            geo_id=search["geo_id"],
            max_pages=4,
            proxies=PROXIES,
            time_filter="r86400",  # last 24 hours
        )
        for job in jobs:
            if job.get("id") and job["id"] not in seen_ids:
                seen_ids.add(job["id"])
                job["search_query"] = kw  # remember which query surfaced it
                all_jobs.append(job)
        print(f" Collected {len(jobs)} jobs")
        time.sleep(random.uniform(10, 20))
    print(f"\nTotal unique jobs: {len(all_jobs)}")
    # Enrich top jobs with details
    print("\nEnriching top jobs with full descriptions...")
    # Jobs that disclose a salary sort first; only the top 30 get detail fetches.
    jobs_to_enrich = sorted(all_jobs, key=lambda j: j.get("salary") is not None, reverse=True)[:30]
    for i, job in enumerate(jobs_to_enrich):
        if job.get("id"):
            try:
                detail = fetch_job_detail(job["id"], proxies=PROXIES)
                job.update(detail)
                job["salary_normalized"] = normalize_salary(job.get("salary"))
            except Exception as e:
                # Best-effort enrichment: one failed detail page doesn't stop the run.
                print(f" Detail fetch failed: {e}")
            if (i + 1) % 5 == 0:
                print(f" Enriched {i+1}/{len(jobs_to_enrich)}")
            time.sleep(random.uniform(4, 8))
    # Run analytics
    print("\nRunning analytics...")
    skills = analyze_skills_demand(all_jobs)
    seniority = analyze_seniority_distribution(all_jobs)
    salary_jobs = extract_salary_data(all_jobs)
    # Save everything
    timestamp = datetime.now().strftime("%Y%m%d_%H%M")
    jobs_file = OUTPUT_DIR / f"jobs_{timestamp}.json"
    jobs_file.write_text(json.dumps(all_jobs, indent=2))
    analytics_file = OUTPUT_DIR / f"analytics_{timestamp}.json"
    analytics_file.write_text(json.dumps({
        "timestamp": datetime.now().isoformat(),
        "total_jobs": len(all_jobs),
        "skills_demand": skills,
        "seniority_distribution": seniority,
        "salary_count": len(salary_jobs),
    }, indent=2))
    save_results(all_jobs, prefix=str(OUTPUT_DIR / f"jobs_{timestamp}"))
    print(f"\n=== Collection Complete ===")
    print(f"Jobs collected: {len(all_jobs)}")
    print(f"With salary data: {len(salary_jobs)}")
    print(f"Files saved to: {OUTPUT_DIR}/")


if __name__ == "__main__":
    run_daily_collection()
Key Patterns for Reliable LinkedIn Scraping
After extensive testing, these patterns consistently produce reliable results:
Session-independent scraping — LinkedIn's public job search does not require cookies or session tokens. Avoid sending cookies at all — it can actually increase detection risk if the cookie is flagged.
Two-phase collection — Always collect job card metadata (title, company, location, salary) in one pass, then fetch full descriptions in a separate pass. The different timing pattern looks more human and lets you save partial results if you get blocked.
Exponential backoff on 429s — A 429 from LinkedIn means you hit the rate limit. Wait at least 60 seconds, then 120s, then 240s. Do not retry immediately.
IP diversity via ThorData — ThorData's residential proxy pool provides IPs from real home connections across the country. LinkedIn's detection system specifically watches for requests originating from data center IP ranges (AWS, GCP, Azure, DigitalOcean). Residential IPs bypass this check entirely.
Store raw HTML as backup — When scraping at scale, save the raw HTML response alongside parsed data. If LinkedIn changes its HTML structure, you can re-parse from stored HTML without making new requests.
Job Description NLP: Extracting Structured Requirements
LinkedIn job descriptions contain rich requirement data buried in unstructured text. Extract structured fields programmatically:
import re
from typing import Optional
# Regexes for pulling a years-of-experience figure out of free text.
# Checked in order; the first match wins in extract_requirements.
EXPERIENCE_PATTERNS = [
    r'(\d+)\+?\s*(?:years?|yrs?)\s+(?:of\s+)?(?:experience|exp)',
    r'(?:minimum|at least|\d+)\s*(\d+)\s*(?:to|-)\s*(\d+)\s*years?',
    r'(entry.?level|mid.?level|senior|staff|principal|lead|junior)',
]
# Degree mentions; the second pattern catches abbreviations (BS, MS, MBA...).
EDUCATION_PATTERNS = [
    r"\b(bachelor'?s?|master'?s?|phd|doctorate|associate'?s?)\s+(?:degree|of science|of arts)?",
    r"\b(bs|ms|mba|phd|ba|ma)\b",
]
# Work-arrangement signals, checked in dict order: remote before hybrid
# before onsite — the first matching key wins.
WORK_LOCATION_PATTERNS = {
    "remote": r"\b(fully remote|100%\s*remote|remote.?only|work from anywhere|distributed team)\b",
    "hybrid": r"\b(hybrid|\d+\s*days?\s*(?:in|per\s+week|onsite|in.?office))\b",
    "onsite": r"\b(on.?site|in.?office|in.?person|must be located|must reside|relocation)\b",
}
def extract_requirements(description: str) -> dict:
    """Extract structured requirements from a free-text job description.

    Pulls a years-of-experience estimate, education mentions, a work
    arrangement (remote/hybrid/onsite/unspecified), and mentioned
    benefits. Returns an empty dict for empty input.
    """
    if not description:
        return {}
    desc_lower = description.lower()
    # Experience requirements: first matching pattern wins (then break).
    experience_years = None
    for pattern in EXPERIENCE_PATTERNS:
        match = re.search(pattern, desc_lower)
        if match:
            if match.lastindex and match.lastindex >= 1:
                try:
                    years_str = match.group(1)
                    if years_str.isdigit():
                        experience_years = int(years_str)
                    # Seniority words map to rough year equivalents.
                    elif years_str in ("entry-level", "entry level", "junior"):
                        experience_years = 0
                    elif years_str in ("mid-level", "mid level"):
                        experience_years = 3
                    elif years_str in ("senior", "lead"):
                        experience_years = 5
                except (IndexError, ValueError):
                    pass
            break
    # Education requirements: collect, then dedupe across both patterns.
    education = []
    for pattern in EDUCATION_PATTERNS:
        matches = re.findall(pattern, desc_lower, re.IGNORECASE)
        education.extend(matches)
    education = list(set(education)) if education else []
    # Work arrangement: first matching key (remote, hybrid, onsite) wins.
    work_arrangement = "unspecified"
    for arrangement, pattern in WORK_LOCATION_PATTERNS.items():
        if re.search(pattern, desc_lower, re.IGNORECASE):
            work_arrangement = arrangement
            break
    # Benefits extraction
    benefit_patterns = {
        "health_insurance": r"\b(health insurance|medical|dental|vision)\b",
        "retirement_401k": r"\b(401k|401\(k\)|retirement|pension|matching)\b",
        "pto": r"\b(pto|paid time off|vacation|holidays?|unlimited pto)\b",
        "equity": r"\b(equity|stock options?|rsu|espp|vesting)\b",
        "remote_stipend": r"\b(home office|remote stipend|equipment|internet stipend)\b",
        "parental_leave": r"\b(parental leave|maternity|paternity|family leave)\b",
    }
    benefits = [benefit for benefit, pattern in benefit_patterns.items()
                if re.search(pattern, desc_lower, re.IGNORECASE)]
    return {
        "experience_years_required": experience_years,
        "education_requirements": education,
        "work_arrangement": work_arrangement,
        "benefits_mentioned": benefits,
    }
def enrich_jobs_with_requirements(jobs: list[dict]) -> list[dict]:
    """Merge extracted requirement fields into each job with a description."""
    for job in jobs:
        description = job.get("description")
        if description:
            job.update(extract_requirements(description))
    return jobs
Advanced Pagination and Deduplication
LinkedIn's search returns duplicates across different keyword searches. Build robust deduplication:
import json
from pathlib import Path
from collections import defaultdict
class DeduplicatedJobCollection:
    """Persistent, ID-deduplicated accumulator of scraped job dicts.

    State (the set of seen job IDs plus the job records themselves) is
    stored as JSON in ``state_file`` so repeated scraper runs keep
    extending one collection instead of re-adding duplicates.
    """

    def __init__(self, state_file: str = "job_dedup_state.json"):
        self.state_file = Path(state_file)
        self._load()

    def _load(self) -> None:
        # Restore prior state when the file exists; otherwise start empty.
        if not self.state_file.exists():
            self.seen_ids = set()
            self.jobs = []
            return
        data = json.loads(self.state_file.read_text())
        self.seen_ids = set(data.get("seen_ids", []))
        self.jobs = data.get("jobs", [])

    def save(self) -> None:
        # Sets are not JSON-serializable, so persist seen_ids as a list.
        payload = {
            "seen_ids": list(self.seen_ids),
            "jobs": self.jobs,
        }
        self.state_file.write_text(json.dumps(payload, indent=2))

    def add(self, new_jobs: list[dict]) -> int:
        """Append jobs with unseen IDs; return how many were new.

        Jobs missing an ``id`` are ignored. State is saved only when at
        least one job was actually added.
        """
        before = len(self.jobs)
        for record in new_jobs:
            rec_id = record.get("id")
            if not rec_id or rec_id in self.seen_ids:
                continue
            self.seen_ids.add(rec_id)
            self.jobs.append(record)
        added = len(self.jobs) - before
        if added:
            self.save()
        return added

    def get_all(self) -> list[dict]:
        return self.jobs

    def get_with_salary(self) -> list[dict]:
        # Jobs where a (truthy) salary string was scraped.
        return [job for job in self.jobs if job.get("salary")]

    def get_by_seniority(self, level: str) -> list[dict]:
        return [job for job in self.jobs if job.get("seniority_level") == level]

    def export_csv(self, output_path: str) -> None:
        """Write the collection to CSV, keeping only the known columns."""
        import csv
        if not self.jobs:
            return
        columns = ["id", "title", "company", "location", "salary", "posted",
                   "seniority_level", "employment_type", "work_arrangement", "url"]
        with open(output_path, "w", newline="", encoding="utf-8") as fh:
            # extrasaction="ignore" drops any fields outside `columns`.
            writer = csv.DictWriter(fh, fieldnames=columns, extrasaction="ignore")
            writer.writeheader()
            writer.writerows(self.jobs)
        print(f"Exported {len(self.jobs)} jobs to {output_path}")
Finding Fully Remote High-Paying Roles
The highest-value query for many users: combining salary floor, remote work, and seniority filters:
import time
import random
def find_remote_senior_roles(
    tech_stack: list[str],
    min_salary_usd: int = 150_000,
    proxies: dict | None = None,
) -> list[dict]:
    """Search remote mid-senior roles for each tech keyword, keeping only
    listings whose normalized minimum salary meets ``min_salary_usd``.

    Args:
        tech_stack: Keywords to search for (e.g. ["python", "rust"]).
        min_salary_usd: Salary floor compared against the normalized
            minimum of each listing's salary range.
        proxies: Optional requests-style proxy mapping passed through to
            the scraping helpers.

    Returns:
        Every job in the (persisted) collection whose normalized minimum
        salary is at or above the floor, enriched with detail and
        requirement fields, printed as a top-10 summary as a side effect.
    """
    collection = DeduplicatedJobCollection("remote_senior_jobs.json")
    for tech in tech_stack:
        print(f"Searching: remote {tech} roles...")
        # Remote filter (f_WT=2) + mid-senior level (f_E=4), last week only.
        jobs = scrape_all_pages(
            keywords=tech,
            location="Remote",
            geo_id="92000000",
            max_pages=8,
            work_type="2",  # remote
            experience="4",  # mid-senior
            time_filter="r604800",  # last week
            proxies=proxies,
        )
        # Enrich with details and requirements (cap detail fetches per search
        # to limit request volume).
        detail_jobs = []
        for job in jobs[:20]:
            if job.get("id"):
                try:
                    detail = fetch_job_detail(job["id"], proxies=proxies)
                    job.update(detail)
                    reqs = extract_requirements(job.get("description", ""))
                    job.update(reqs)
                except Exception:
                    # Best-effort enrichment: keep the bare listing on failure.
                    pass
            detail_jobs.append(job)
            time.sleep(random.uniform(3, 6))
        # Normalize salaries for every listing that advertised one.
        salaried = [j for j in detail_jobs if j.get("salary")]
        for job in salaried:
            job["salary_normalized"] = normalize_salary(job.get("salary"))
        # BUG FIX: the original unconditionally added ALL salaried jobs to the
        # persistent collection (and had a dead `if ...: pass` branch), so the
        # state file accumulated listings below the salary floor. Persist only
        # jobs meeting the floor.
        added = collection.add([
            j for j in salaried
            if j.get("salary_normalized")
            and j["salary_normalized"].get("min", 0) >= min_salary_usd
        ])
        print(f"  Added {added} high-paying remote {tech} roles")
        time.sleep(random.uniform(10, 15))
    results = [
        j for j in collection.get_all()
        if j.get("salary_normalized")
        and j["salary_normalized"].get("min", 0) >= min_salary_usd
    ]
    print(f"\nFound {len(results)} remote senior roles with ${min_salary_usd:,}+ salary")
    # Show the top 10 by midpoint salary, highest first.
    ranked = sorted(results,
                    key=lambda j: (j.get("salary_normalized") or {}).get("mid", 0),
                    reverse=True)
    for job in ranked[:10]:
        sal = job.get("salary_normalized") or {}
        sal_str = f"${sal.get('min', 0):,.0f}-${sal.get('max', 0):,.0f}" if sal else "N/A"
        print(f"  {sal_str:<25} {job['title']} at {job['company']}")
    return results
# Example usage:
# proxies = {"http": "http://USER:[email protected]:9000",
# "https": "http://USER:[email protected]:9000"}
# roles = find_remote_senior_roles(
# tech_stack=["python", "golang", "rust", "data engineering"],
# min_salary_usd=160_000,
# proxies=proxies,
# )
Long-Term Market Trend Analysis
By running the scraper weekly over months, you can build a dataset that reveals macro trends in the job market:
import json
import statistics
from pathlib import Path
from datetime import datetime, timedelta
from collections import defaultdict
def analyze_market_trend(
    data_dir: str,
    keyword: str,
    months_back: int = 6,
) -> list[dict]:
    """Build a per-snapshot timeline of listing counts, average salary,
    and remote share for jobs whose title contains *keyword*.

    Expects ``data_dir`` to contain JSON snapshot files whose names embed
    the capture date as the second-to-last underscore-separated stem token
    in ``%Y%m%d`` form (e.g. ``jobs_20250101_run.json``); files that don't
    match the scheme are skipped. Prints a formatted trend table and
    returns the timeline data points.
    """
    cutoff = datetime.now() - timedelta(days=months_back * 30)
    timeline = []
    for snapshot in sorted(Path(data_dir).glob("*.json")):
        try:
            snap_date = datetime.strptime(snapshot.stem.split("_")[-2], "%Y%m%d")
        except (ValueError, IndexError):
            # Filename doesn't follow the dated naming convention.
            continue
        if snap_date < cutoff:
            continue
        payload = json.loads(snapshot.read_text())
        records = payload if isinstance(payload, list) else payload.get("jobs", [])
        # Keep only listings whose title mentions the keyword.
        matching = [r for r in records
                    if keyword.lower() in (r.get("title") or "").lower()]
        if not matching:
            continue
        normalized = [normalize_salary(r.get("salary")) for r in matching]
        mids = [s["mid"] for s in normalized if s and s.get("mid")]
        remote = sum(
            1 for r in matching
            if r.get("work_arrangement") == "remote"
            or (r.get("location") and "remote" in r["location"].lower())
        )
        timeline.append({
            "date": snap_date.strftime("%Y-%m-%d"),
            "total_listings": len(matching),
            "with_salary": len(mids),
            "avg_salary": statistics.mean(mids) if mids else None,
            "remote_pct": remote / len(matching) * 100 if matching else 0,
        })
    print(f"\n{keyword} job market trend (past {months_back} months):")
    print(f"{'Date':<12} {'Listings':>9} {'Avg Salary':>12} {'Remote%':>8}")
    print("-" * 44)
    for point in timeline:
        sal = f"${point['avg_salary']:>9,.0f}" if point["avg_salary"] else " N/A"
        print(f"{point['date']:<12} {point['total_listings']:>9} {sal:>12} {point['remote_pct']:>7.1f}%")
    return timeline
Quick Reference: LinkedIn Job Search URL Parameters
For fast experimentation without writing code, here is a complete reference of the most useful LinkedIn job search URL parameters:
| Parameter | Values | Description |
|---|---|---|
| `keywords` | URL-encoded string | Job title or skill keywords |
| `location` | City, State or Country | Geographic filter |
| `geoId` | Numeric ID | LinkedIn's internal location ID |
| `f_TPR` | `r86400` (24h), `r604800` (7d), `r2592000` (30d) | Date posted filter |
| `f_WT` | `1` (on-site), `2` (remote), `3` (hybrid) | Work type |
| `f_E` | `1` (intern), `2` (entry), `3` (associate), `4` (mid-senior), `5` (director), `6` (exec) | Experience level |
| `f_JT` | `F` (full-time), `P` (part-time), `C` (contract), `T` (temp), `I` (intern) | Job type |
| `f_C` | Numeric company ID | Filter by company |
| `start` | Multiple of 25 | Pagination offset |
| `sortBy` | `DD` (most recent), `R` (most relevant) | Sort order |
Example — remote senior Python jobs posted in the last 7 days:
https://www.linkedin.com/jobs/search/?keywords=python&f_WT=2&f_E=4&f_TPR=r604800&sortBy=DD
Use geoId for precise location targeting (e.g., geoId=102571732 for United States). Find a location's geoId by inspecting the URL after manually searching on LinkedIn.
Combine with ThorData residential proxies to scrape these URLs at scale without triggering LinkedIn's bot detection.