Advanced LinkedIn Profile Scraping Techniques (2026)
LinkedIn is probably the most legally contentious scraping target out there. The hiQ v. LinkedIn case established that scraping public data isn't a CFAA violation, but LinkedIn still aggressively blocks scrapers and will terminate accounts they catch. So tread carefully.
That said, LinkedIn's data is incredibly valuable for recruiters, researchers, and sales teams. And their official API is so restricted it's almost useless — you can barely access your own profile data through it.
The real power is in LinkedIn's Voyager API — the internal REST API that powers the web app.
Understanding the Voyager API
When you browse LinkedIn in your browser, every profile load, search, and interaction goes through their Voyager API at https://www.linkedin.com/voyager/api/. Open your browser's network tab on LinkedIn and you'll see hundreds of these requests.
The key endpoints we care about:
- /identity/profiles/{vanityName} — basic profile info
- /identity/profiles/{vanityName}/skills — skills and endorsements
- /identity/profiles/{vanityName}/profileContactInfo — contact details
- /identity/dash/profiles — the newer "dash" API with richer data
- /search/dash/clusters — people search
- /identity/profiles/{vanityName}/positions — work experience
- /identity/profiles/{vanityName}/educations — education history
Each endpoint returns normalized JSON where the actual data is split across data and included arrays. The included array contains all referenced objects — profile components, companies, schools — keyed by their URN identifiers.
Authentication Setup
You need a valid LinkedIn session. The Voyager API uses two tokens: li_at (session cookie) and a CSRF token extracted from JSESSIONID:
import httpx
import time
import random
class LinkedInScraper:
    """Minimal client for LinkedIn's internal Voyager REST API.

    Authenticates with a caller-supplied ``li_at`` session cookie and the
    CSRF token that LinkedIn mirrors in the ``JSESSIONID`` cookie.
    """

    BASE_URL = "https://www.linkedin.com/voyager/api"

    # Headers the LinkedIn web app sends on every Voyager request; copied
    # verbatim so our traffic blends in with real browser sessions.
    _BROWSER_HEADERS = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/125.0.0.0 Safari/537.36",
        "Accept": "application/vnd.linkedin.normalized+json+2.1",
        "Accept-Language": "en-US,en;q=0.9",
        "X-Li-Lang": "en_US",
        "X-Li-Page-Instance": "urn:li:page:d_flagship3_profile_view_base",
        "X-Restli-Protocol-Version": "2.0.0",
    }

    def __init__(self, li_at_cookie: str, proxy_url: str | None = None):
        """Open an authenticated httpx session, optionally through a proxy."""
        transport = None
        if proxy_url:
            transport = httpx.HTTPTransport(proxy=proxy_url)
        self.session = httpx.Client(
            headers=dict(self._BROWSER_HEADERS),
            cookies={"li_at": li_at_cookie},
            transport=transport,
            timeout=20,
            follow_redirects=True,
        )
        self._set_csrf_token()

    def _set_csrf_token(self):
        """Extract CSRF token from session cookies.

        Loading the feed page makes LinkedIn set JSESSIONID; the Voyager
        API expects the Csrf-Token header to equal that cookie's value
        with the surrounding quotes stripped.
        """
        self.session.get("https://www.linkedin.com/feed/")
        token = self.session.cookies.get("JSESSIONID", "").strip('"')
        self.session.headers["Csrf-Token"] = token

    def get_profile(self, vanity_name: str) -> dict:
        """Fetch full profile data via the dash API."""
        decoration = (
            "com.linkedin.voyager.dash.deco.identity.profile"
            ".FullProfileWithEntities-93"
        )
        response = self.session.get(
            f"{self.BASE_URL}/identity/dash/profiles",
            params={
                "q": "memberIdentity",
                "memberIdentity": vanity_name,
                "decorationId": decoration,
            },
        )
        response.raise_for_status()
        return response.json()

    def get_skills(self, vanity_name: str) -> list[dict]:
        """Fetch skills and endorsement counts."""
        response = self.session.get(
            f"{self.BASE_URL}/identity/profiles/{vanity_name}/skills",
            params={"count": 100, "start": 0},
        )
        response.raise_for_status()
        return [
            {
                "name": item.get("name", ""),
                "endorsement_count": item.get("endorsementCount", 0),
            }
            for item in response.json().get("elements", [])
        ]

    def get_contact_info(self, vanity_name: str) -> dict:
        """Fetch email, phone, Twitter, websites."""
        response = self.session.get(
            f"{self.BASE_URL}/identity/profiles/"
            f"{vanity_name}/profileContactInfo"
        )
        response.raise_for_status()
        payload = response.json()
        return {
            "email": payload.get("emailAddress"),
            "phone": [p.get("number") for p in payload.get("phoneNumbers", [])],
            "twitter": [t.get("name") for t in payload.get("twitterHandles", [])],
            "websites": [w.get("url") for w in payload.get("websites", [])],
        }

    def get_positions(self, vanity_name: str) -> list[dict]:
        """Fetch work experience / position history."""
        response = self.session.get(
            f"{self.BASE_URL}/identity/profiles/{vanity_name}/positions",
            params={"count": 50, "start": 0},
        )
        response.raise_for_status()
        history = []
        for entry in response.json().get("elements", []):
            dates = entry.get("dateRange", {})
            started = dates.get("start", {})
            ended = dates.get("end", {})
            history.append({
                "title": entry.get("title", ""),
                "company": entry.get("companyName", ""),
                "start_year": started.get("year"),
                "start_month": started.get("month"),
                "end_year": ended.get("year"),
                "end_month": ended.get("month"),
                # No end date in the payload means the role is ongoing.
                "is_current": not ended,
                "description": entry.get("description", ""),
                "location": entry.get("locationName", ""),
            })
        return history

    def get_educations(self, vanity_name: str) -> list[dict]:
        """Fetch education history."""
        response = self.session.get(
            f"{self.BASE_URL}/identity/profiles/{vanity_name}/educations"
        )
        response.raise_for_status()
        return [
            {
                "school": entry.get("schoolName", ""),
                "degree": entry.get("degreeName", ""),
                "field": entry.get("fieldOfStudy", ""),
                "start_year": entry.get("timePeriod", {}).get("startDate", {}).get("year"),
                "end_year": entry.get("timePeriod", {}).get("endDate", {}).get("year"),
            }
            for entry in response.json().get("elements", [])
        ]
Extracting Skills and Endorsements
Skills data is structured but buried in nested responses:
def get_detailed_skills(scraper: LinkedInScraper, vanity_name: str) -> list[dict]:
    """Get skills sorted by endorsement count.

    Also prints a small ASCII bar chart of the top 20 skills.
    """
    ranked = sorted(
        scraper.get_skills(vanity_name),
        key=lambda skill: skill["endorsement_count"],
        reverse=True,
    )
    print(f"\nSkills for {vanity_name}:")
    print("-" * 50)
    for entry in ranked[:20]:
        # One '#' per 5 endorsements, capped at 30 characters.
        bar = "#" * min(entry["endorsement_count"] // 5, 30)
        print(f" {entry['name']:35s} {entry['endorsement_count']:4d} {bar}")
    return ranked
# Usage
# NOTE(review): placeholder — substitute a real li_at cookie value taken
# from an authenticated browser session before running.
li_at = "YOUR_LI_AT_COOKIE_VALUE"
scraper = LinkedInScraper(li_at)
skills = get_detailed_skills(scraper, "williamhgates")
contact = scraper.get_contact_info("williamhgates")
print(f"Websites: {contact['websites']}")
Parsing the Normalized JSON Response
LinkedIn's dash API returns data in a normalized format requiring assembly:
def parse_full_profile(raw_response: dict) -> dict:
    """Assemble a flat profile dict from LinkedIn's normalized JSON.

    The dash API splits data across a top-level ``included`` array whose
    entries reference each other by URN; this stitches profile, position,
    and company entries back together.

    Args:
        raw_response: Raw JSON dict as returned by ``get_profile``.

    Returns:
        Flat dict with name/headline/summary/location, a ``positions``
        list, and follower/connection counts. Empty dict if no Profile
        entity is present.

    Fix: the API emits JSON ``null`` (not a missing key) for absent
    references, so ``el.get("company", {})`` and friends could raise
    AttributeError; ``or {}`` guards now normalize null to an empty dict.
    """
    elements = raw_response.get("included", [])
    # URN -> entity index for resolving cross-references.
    included = {el["entityUrn"]: el for el in elements if "entityUrn" in el}

    profile = next(
        (el for el in elements
         if el.get("$type") == "com.linkedin.voyager.dash.identity.profile.Profile"),
        None,
    )
    if not profile:
        return {}

    name = f"{profile.get('firstName', '')} {profile.get('lastName', '')}".strip()

    # Collect position entities and resolve each one's company by URN.
    positions = []
    for el in elements:
        if el.get("$type") != "com.linkedin.voyager.dash.identity.profile.Position":
            continue
        company_ref = el.get("company") or {}
        company = included.get(company_ref.get("entityUrn", ""), {})
        industries = company.get("industries") or []
        date_range = el.get("dateRange") or {}
        positions.append({
            "title": el.get("title", ""),
            # Prefer the resolved company entity; fall back to the
            # denormalized companyName on the position itself.
            "company": company.get("name", el.get("companyName", "")),
            "company_industry": industries[0] if industries else None,
            "start": date_range.get("start", {}),
            "end": date_range.get("end"),
            "description": el.get("description", ""),
        })

    following = profile.get("followingInfo") or {}
    connections = profile.get("connections") or {}
    return {
        "name": name,
        "headline": profile.get("headline", ""),
        "summary": profile.get("summary", ""),
        "location": profile.get("geoLocationName", ""),
        "positions": positions,
        "followers": following.get("followerCount", 0),
        "connections": (connections.get("paging") or {}).get("total", 0),
    }
Connection Graph Mapping
You can map out a user's visible connections — useful for network analysis and sales intelligence:
def map_connections(
    scraper: LinkedInScraper,
    vanity_name: str,
    max_results: int = 100,
) -> list[dict]:
    """Scrape a user's visible first and second-degree connections.

    Pages through the search clusters endpoint until no more results
    appear or ``max_results`` is reached; retries indefinitely on 429.
    """
    endpoint = f"{scraper.BASE_URL}/search/dash/clusters"
    page_size = 10  # LinkedIn caps at 10 per request
    # The search query is invariant across pages, so build it once.
    search_query = (
        f"(flagshipSearchIntent:SEARCH_SRP,"
        f"queryParameters:(connectionOf:List({vanity_name}),"
        f"network:List(F,S),resultType:List(PEOPLE)))"
    )
    collected: list[dict] = []
    offset = 0
    while offset < max_results:
        response = scraper.session.get(endpoint, params={
            "decorationId": (
                "com.linkedin.voyager.dash.deco.search"
                ".SearchClusterCollection-186"
            ),
            "origin": "MEMBER_PROFILE_CANNED_SEARCH",
            "q": "all",
            "query": search_query,
            "start": offset,
            "count": page_size,
        })
        if response.status_code == 429:
            pause = random.uniform(45, 90)
            print(f"Rate limited — sleeping {pause:.0f}s")
            time.sleep(pause)
            continue
        response.raise_for_status()
        # Only EntityResultViewModel entries carry person results.
        people = [
            el for el in response.json().get("included", [])
            if "EntityResultViewModel" in el.get("$type", "")
        ]
        if not people:
            break
        for person in people:
            collected.append({
                "name": person.get("title", {}).get("text", ""),
                "headline": person.get("primarySubtitle", {}).get("text", ""),
                "location": person.get("secondarySubtitle", {}).get("text", ""),
                "profile_url": person.get("navigationUrl", ""),
                "connection_degree": person.get("memberDistance", {}).get("value", ""),
            })
        offset += page_size
        print(f"Fetched {len(collected)} connections so far")
        time.sleep(random.uniform(5, 10))
    return collected
People Search API
LinkedIn's people search is the real workhorse for lead generation and talent research:
def search_people(
    scraper: LinkedInScraper,
    keywords: str,
    max_results: int = 50,
) -> list[dict]:
    """Search LinkedIn profiles by keyword.

    Pages through search results until exhausted or ``max_results`` is
    reached; a 429 triggers a flat 60s back-off and a retry of the page.
    """
    endpoint = f"{scraper.BASE_URL}/search/dash/clusters"
    page_size = 10
    matches: list[dict] = []
    offset = 0
    while offset < max_results:
        response = scraper.session.get(endpoint, params={
            "decorationId": "com.linkedin.voyager.dash.deco.search.SearchClusterCollection-186",
            "origin": "GLOBAL_SEARCH_HEADER",
            "q": "all",
            "query": (
                f"(flagshipSearchIntent:SEARCH_SRP,queryParameters:"
                f"(keywords:List({keywords}),resultType:List(PEOPLE)))"
            ),
            "start": offset,
            "count": page_size,
        })
        if response.status_code == 429:
            time.sleep(60)
            continue
        response.raise_for_status()
        # Person hits are the EntityResultViewModel entries in `included`.
        people = [
            el for el in response.json().get("included", [])
            if "EntityResultViewModel" in el.get("$type", "")
        ]
        if not people:
            break
        for person in people:
            matches.append({
                "name": person.get("title", {}).get("text", ""),
                "headline": person.get("primarySubtitle", {}).get("text", ""),
                "location": person.get("secondarySubtitle", {}).get("text", ""),
                "profile_url": person.get("navigationUrl", ""),
                "connection_degree": person.get("memberDistance", {}).get("value", ""),
            })
        offset += page_size
        time.sleep(random.uniform(4, 8))
    return matches[:max_results]
Storing Profile Data in SQLite
For any serious lead generation or research workflow, you want a local database:
import sqlite3
import json
from datetime import datetime
def init_db(db_path: str) -> sqlite3.Connection:
    """Open (or create) the SQLite database and ensure all tables exist.

    Returns an open connection; safe to call repeatedly thanks to
    ``IF NOT EXISTS`` on every table.
    """
    schema = """
    CREATE TABLE IF NOT EXISTS profiles (
        vanity_name TEXT PRIMARY KEY,
        name TEXT,
        headline TEXT,
        summary TEXT,
        location TEXT,
        followers INTEGER,
        connections INTEGER,
        scraped_at TEXT,
        raw_json TEXT
    );
    CREATE TABLE IF NOT EXISTS skills (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        vanity_name TEXT,
        skill_name TEXT,
        endorsement_count INTEGER,
        UNIQUE(vanity_name, skill_name)
    );
    CREATE TABLE IF NOT EXISTS positions (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        vanity_name TEXT,
        title TEXT,
        company TEXT,
        start_year INTEGER,
        start_month INTEGER,
        end_year INTEGER,
        end_month INTEGER,
        is_current INTEGER,
        description TEXT
    );
    CREATE TABLE IF NOT EXISTS contact_info (
        vanity_name TEXT PRIMARY KEY,
        email TEXT,
        phone TEXT,
        twitter TEXT,
        websites TEXT,
        scraped_at TEXT
    );
    """
    connection = sqlite3.connect(db_path)
    connection.executescript(schema)
    connection.commit()
    return connection
def store_profile(conn: sqlite3.Connection, vanity_name: str, scraper: LinkedInScraper):
    """Fetch and store a complete profile with all sub-data.

    Fetches profile, skills, and positions up front, then writes them in
    a single committed batch. Profile rows are upserted; skills are
    insert-or-ignore (deduplicated by the UNIQUE constraint); positions
    are appended.
    """
    timestamp = datetime.utcnow().isoformat()
    raw = scraper.get_profile(vanity_name)
    flat = parse_full_profile(raw)
    skill_rows = scraper.get_skills(vanity_name)
    position_rows = scraper.get_positions(vanity_name)

    conn.execute(
        "INSERT OR REPLACE INTO profiles "
        "(vanity_name, name, headline, summary, location, followers, connections, scraped_at, raw_json) "
        "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)",
        (
            vanity_name,
            flat.get("name"),
            flat.get("headline"),
            flat.get("summary"),
            flat.get("location"),
            flat.get("followers"),
            flat.get("connections"),
            timestamp,
            json.dumps(raw),  # keep the raw payload for later re-parsing
        ),
    )
    conn.executemany(
        "INSERT OR IGNORE INTO skills (vanity_name, skill_name, endorsement_count) "
        "VALUES (?, ?, ?)",
        [(vanity_name, s["name"], s["endorsement_count"]) for s in skill_rows],
    )
    conn.executemany(
        "INSERT INTO positions "
        "(vanity_name, title, company, start_year, start_month, "
        "end_year, end_month, is_current, description) "
        "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)",
        [
            (vanity_name, p["title"], p["company"],
             p.get("start_year"), p.get("start_month"),
             p.get("end_year"), p.get("end_month"),
             int(p.get("is_current", False)), p.get("description"))
            for p in position_rows
        ],
    )
    conn.commit()
    print(f"Stored profile: {flat.get('name')} ({vanity_name})")
Sales Navigator Difference
LinkedIn Sales Navigator uses the same Voyager API but with elevated permissions. A Sales Navigator li_at cookie unlocks:
- Full profile views without consuming "credits"
- Extended search filters (company size, revenue, technologies used)
- Lead lists and saved searches
- InMail data and response rates
- Account mapping features
- Buyer Intent signals and job change alerts
The API calls are nearly identical — the access level is determined by your account type, not different endpoints.
Anti-Detection Measures LinkedIn Uses
Request velocity tracking. A human doesn't view 100 profiles per minute. Keep to 1 profile every 10-15 seconds minimum. Connection mapping needs even longer delays between requests.
Session fingerprinting. Your li_at cookie is tied to specific behavioral patterns. Perfectly regular request timing is a dead giveaway for automation. Use random.uniform() for all delays.
IP correlation. If your account was created from a US IP but makes requests from a Romanian datacenter, that's a flag. ThorData residential proxies let you geo-target to match your account's expected location — city-level targeting is available, so a "San Francisco" LinkedIn account can use Bay Area residential IPs.
TLS fingerprinting. LinkedIn checks your TLS client hello against known browser fingerprints. A standard httpx client has a different TLS fingerprint than Chrome. Consider curl-cffi which impersonates Chrome's TLS fingerprint:
# curl_cffi can impersonate a real browser's TLS ClientHello, unlike a
# plain httpx client whose fingerprint differs from Chrome's.
from curl_cffi import requests as cffi_requests
session = cffi_requests.Session(impersonate="chrome120")
# Re-attach the LinkedIn auth cookie to the impersonated session.
session.cookies.update({"li_at": li_at_cookie})
Phantom profile views. LinkedIn tracks who views what. 500 profile views per day from a new account is a clear pattern. Use private browsing mode in account settings, or keep daily volume under 50.
Account restrictions. First offense: temporary restriction with warning. Second: CAPTCHA on every action. Third: account termination and potential IP ban.
Staying Under the Radar
Use a dedicated account. Never scrape with your real LinkedIn account. Create a separate one, fill out the profile legitimately (real-looking photo, complete work history, 50+ connections), and let it age for 2-3 weeks before starting.
Warm up slowly. Start with 10-20 profile views per day. Increase gradually over weeks. LinkedIn's ML models learn baseline behavior per account — sudden spikes from zero are flagged.
Mix in normal activity. Between scraping requests, load the feed, check notifications, like a post. Make the session look organic.
Rotate sessions. Use 2-3 hour windows with breaks between sessions. Mimic human working hours (don't scrape at 3am from a "US" account).
Respect rate limit thresholds: - Between individual profile fetches: 8-15 seconds - Between searches: 15-30 seconds - After 50 profiles: 10-minute break - After 200 profiles: done for the day
def scrape_profile_batch(
    scraper: LinkedInScraper,
    vanity_names: list[str],
    db_conn: sqlite3.Connection,
):
    """Scrape a batch of profiles with human-like timing and auto-breaks.

    Per-profile errors are logged and skipped; rate-limit errors add a
    5-minute pause. Delays are randomized (mostly 8-15s, occasionally
    much longer) with a 10-15 minute break roughly every 20-30 profiles.
    """
    total = len(vanity_names)
    for index, vanity in enumerate(vanity_names):
        print(f"[{index+1}/{total}] Scraping {vanity}...")
        try:
            store_profile(db_conn, vanity, scraper)
        except Exception as exc:
            print(f" Error: {exc}")
            text = str(exc)
            if "429" in text or "blocked" in text.lower():
                print(" Rate limited — sleeping 5 minutes")
                time.sleep(300)
            continue
        # Variable delay: 80% short (8-15s), 15% medium, 5% long.
        roll = random.uniform(0, 100)
        if roll < 80:
            delay = random.uniform(8, 15)
        elif roll < 95:
            delay = random.uniform(30, 60)
        else:
            delay = random.uniform(120, 180)
        # Longer break every 20-30 profiles
        if index > 0 and index % random.randint(20, 30) == 0:
            break_time = random.uniform(600, 900)
            print(f" Taking {break_time/60:.0f} min break after {index} profiles...")
            time.sleep(break_time)
        else:
            print(f" Waiting {delay:.1f}s")
            time.sleep(delay)
Using ThorData Residential Proxies
For sustained scraping operations, rotating residential proxies are essential. Datacenter IPs get flagged almost immediately by LinkedIn's detection systems. ThorData's residential proxy network provides IPs from real ISPs with city-level targeting:
def build_thordata_proxy(
    user: str,
    password: str,
    country: str = "US",
    city: str = "SanFrancisco",
) -> str:
    """Build ThorData proxy URL with geo-targeting.

    Country/city targeting is encoded into the proxy username, per
    ThorData's credential convention.
    """
    username = f"{user}-country-{country}-city-{city}"
    return f"http://{username}:{password}@proxy.thordata.com:9000"
# Create scraper with geo-matched residential proxy
# NOTE(review): `li_at` must already be defined (see the earlier usage
# snippet); replace the placeholder credentials before running.
proxy = build_thordata_proxy(
    user="YOUR_USER",
    password="YOUR_PASS",
    country="US",
    city="SanFrancisco",  # Match your LinkedIn account's listed location
)
scraper = LinkedInScraper(li_at, proxy_url=proxy)
For rotating proxies between batches (recommended to avoid any single IP accumulating too many LinkedIn requests):
class RotatingLinkedInScraper:
    """LinkedIn scraper that rotates proxies between request batches.

    Builds a fresh ``LinkedInScraper`` (and therefore a fresh proxy
    session) every ``rotate_every`` profile fetches.
    """

    def __init__(self, li_at: str, proxy_base: str, rotate_every: int = 15):
        self.li_at = li_at
        self.proxy_base = proxy_base
        self.rotate_every = rotate_every
        self._scraper: LinkedInScraper | None = None
        self._count = 0

    def _new_scraper(self) -> LinkedInScraper:
        # A random session parameter asks the proxy provider for a new IP.
        proxy_url = f"{self.proxy_base}?session={random.randint(10000, 99999)}"
        return LinkedInScraper(self.li_at, proxy_url=proxy_url)

    def get_profile(self, vanity_name: str) -> dict:
        """Fetch a profile, rotating to a fresh proxy session as needed."""
        needs_rotation = self._scraper is None or self._count >= self.rotate_every
        if needs_rotation:
            self._scraper = self._new_scraper()
            self._count = 0
        self._count += 1
        return self._scraper.get_profile(vanity_name)
Use Cases and Ethics
Legitimate uses for LinkedIn scraping include academic labor market research, recruitment intelligence, sales prospecting, job market analytics (tracking which skills are growing in demand), and company intelligence (monitoring leadership changes).
The line not to cross: scraping at massive scale for resale, building products competing directly with Sales Navigator or Recruiter, or combining LinkedIn data with other datasets to build surveillance tools.
LinkedIn scraping is a cat-and-mouse game. They have one of the best anti-bot teams in the industry and are not afraid to pursue legal action against commercial scraping operations. For personal research and small-scale data collection, the Voyager API approach works well — just be smart about velocity, use proper residential proxy infrastructure, and don't push your luck.