← Back to blog

Advanced LinkedIn Profile Scraping Techniques (2026)

Advanced LinkedIn Profile Scraping Techniques (2026)

LinkedIn is probably the most legally contentious scraping target out there. The hiQ v. LinkedIn case established that scraping public data isn't a CFAA violation, but LinkedIn still aggressively blocks scrapers and will terminate accounts they catch. So tread carefully.

That said, LinkedIn's data is incredibly valuable for recruiters, researchers, and sales teams. And their official API is so restricted it's almost useless — you can barely access your own profile data through it.

The real power is in LinkedIn's Voyager API — the internal REST API that powers the web app.

Understanding the Voyager API

When you browse LinkedIn in your browser, every profile load, search, and interaction goes through their Voyager API at https://www.linkedin.com/voyager/api/. Open your browser's network tab on LinkedIn and you'll see hundreds of these requests.

The key endpoints we care about are profile lookup, skills, contact info, positions, educations, and people search — each is exercised by the code below.

Each endpoint returns normalized JSON where the actual data is split across data and included arrays. The included array contains all referenced objects — profile components, companies, schools — keyed by their URN identifiers.

Authentication Setup

You need a valid LinkedIn session. The Voyager API uses two tokens: li_at (session cookie) and a CSRF token extracted from JSESSIONID:

import httpx
import time
import random

class LinkedInScraper:
    BASE_URL = "https://www.linkedin.com/voyager/api"

    def __init__(self, li_at_cookie: str, proxy_url: str | None = None):
        transport = httpx.HTTPTransport(proxy=proxy_url) if proxy_url else None
        self.session = httpx.Client(
            headers={
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                              "AppleWebKit/537.36 (KHTML, like Gecko) "
                              "Chrome/125.0.0.0 Safari/537.36",
                "Accept": "application/vnd.linkedin.normalized+json+2.1",
                "Accept-Language": "en-US,en;q=0.9",
                "X-Li-Lang": "en_US",
                "X-Li-Page-Instance": "urn:li:page:d_flagship3_profile_view_base",
                "X-Restli-Protocol-Version": "2.0.0",
            },
            cookies={"li_at": li_at_cookie},
            transport=transport,
            timeout=20,
            follow_redirects=True,
        )
        self._set_csrf_token()

    def _set_csrf_token(self):
        """Extract CSRF token from session cookies."""
        resp = self.session.get("https://www.linkedin.com/feed/")
        csrf = self.session.cookies.get("JSESSIONID", "").strip('"')
        self.session.headers["Csrf-Token"] = csrf

    def get_profile(self, vanity_name: str) -> dict:
        """Fetch full profile data via the dash API."""
        url = f"{self.BASE_URL}/identity/dash/profiles"
        params = {
            "q": "memberIdentity",
            "memberIdentity": vanity_name,
            "decorationId": (
                "com.linkedin.voyager.dash.deco.identity.profile"
                ".FullProfileWithEntities-93"
            ),
        }
        resp = self.session.get(url, params=params)
        resp.raise_for_status()
        return resp.json()

    def get_skills(self, vanity_name: str) -> list[dict]:
        """Fetch skills and endorsement counts."""
        url = f"{self.BASE_URL}/identity/profiles/{vanity_name}/skills"
        params = {"count": 100, "start": 0}
        resp = self.session.get(url, params=params)
        resp.raise_for_status()
        data = resp.json()
        skills = []
        for element in data.get("elements", []):
            skills.append({
                "name": element.get("name", ""),
                "endorsement_count": element.get("endorsementCount", 0),
            })
        return skills

    def get_contact_info(self, vanity_name: str) -> dict:
        """Fetch email, phone, Twitter, websites."""
        url = (f"{self.BASE_URL}/identity/profiles/"
               f"{vanity_name}/profileContactInfo")
        resp = self.session.get(url)
        resp.raise_for_status()
        data = resp.json()
        return {
            "email": data.get("emailAddress"),
            "phone": [p.get("number") for p in data.get("phoneNumbers", [])],
            "twitter": [t.get("name") for t in data.get("twitterHandles", [])],
            "websites": [w.get("url") for w in data.get("websites", [])],
        }

    def get_positions(self, vanity_name: str) -> list[dict]:
        """Fetch work experience / position history."""
        url = f"{self.BASE_URL}/identity/profiles/{vanity_name}/positions"
        resp = self.session.get(url, params={"count": 50, "start": 0})
        resp.raise_for_status()
        data = resp.json()
        positions = []
        for el in data.get("elements", []):
            date_range = el.get("dateRange", {})
            start_dt = date_range.get("start", {})
            end_dt = date_range.get("end", {})
            positions.append({
                "title": el.get("title", ""),
                "company": el.get("companyName", ""),
                "start_year": start_dt.get("year"),
                "start_month": start_dt.get("month"),
                "end_year": end_dt.get("year"),
                "end_month": end_dt.get("month"),
                "is_current": end_dt == {},
                "description": el.get("description", ""),
                "location": el.get("locationName", ""),
            })
        return positions

    def get_educations(self, vanity_name: str) -> list[dict]:
        """Fetch education history."""
        url = f"{self.BASE_URL}/identity/profiles/{vanity_name}/educations"
        resp = self.session.get(url)
        resp.raise_for_status()
        data = resp.json()
        educations = []
        for el in data.get("elements", []):
            educations.append({
                "school": el.get("schoolName", ""),
                "degree": el.get("degreeName", ""),
                "field": el.get("fieldOfStudy", ""),
                "start_year": el.get("timePeriod", {}).get("startDate", {}).get("year"),
                "end_year": el.get("timePeriod", {}).get("endDate", {}).get("year"),
            })
        return educations

Extracting Skills and Endorsements

Skills data is structured but buried in nested responses:

def get_detailed_skills(scraper: LinkedInScraper, vanity_name: str) -> list[dict]:
    """Get skills sorted by endorsement count."""
    skills = sorted(
        scraper.get_skills(vanity_name),
        key=lambda entry: entry["endorsement_count"],
        reverse=True,
    )

    # Render a quick console histogram of the top 20 skills.
    print(f"\nSkills for {vanity_name}:")
    print("-" * 50)
    for entry in skills[:20]:
        count = entry["endorsement_count"]
        bar = "#" * min(count // 5, 30)
        print(f"  {entry['name']:35s} {count:4d} {bar}")

    return skills


# Usage
# NOTE: copy the li_at value from your logged-in browser session
# (DevTools > Application > Cookies); it is an opaque session token.
li_at = "YOUR_LI_AT_COOKIE_VALUE"
scraper = LinkedInScraper(li_at)

# Both calls go through the authenticated Voyager session created above.
skills = get_detailed_skills(scraper, "williamhgates")
contact = scraper.get_contact_info("williamhgates")
print(f"Websites: {contact['websites']}")

Parsing the Normalized JSON Response

LinkedIn's dash API returns data in a normalized format requiring assembly:

def parse_full_profile(raw_response: dict) -> dict:
    """Assemble a flat profile dict from LinkedIn's normalized JSON.

    The dash API splits a profile across the ``included`` array: the Profile
    entity, Position entities, and the Company entities they reference are
    separate objects keyed by URN. This stitches them back together.

    Returns an empty dict when no Profile entity is present.
    """
    entities = raw_response.get("included", [])

    # URN -> entity index so positions can resolve their company references.
    by_urn = {e["entityUrn"]: e for e in entities if "entityUrn" in e}

    profile = next(
        (e for e in entities
         if e.get("$type") == "com.linkedin.voyager.dash.identity.profile.Profile"),
        None,
    )
    if not profile:
        return {}

    name = f'{profile.get("firstName", "")} {profile.get("lastName", "")}'.strip()

    positions = []
    for e in entities:
        if e.get("$type") != "com.linkedin.voyager.dash.identity.profile.Position":
            continue
        # Voyager emits explicit JSON nulls, so normalize with `or {}` rather
        # than relying on dict.get defaults (which only cover missing keys).
        company_ref = e.get("company") or {}
        company = by_urn.get(company_ref.get("entityUrn", ""), {})
        date_range = e.get("dateRange") or {}
        industries = company.get("industries") or []
        positions.append({
            "title": e.get("title", ""),
            # Prefer the resolved Company entity's name; fall back to the
            # denormalized companyName on the position itself.
            "company": company.get("name", e.get("companyName", "")),
            "company_industry": industries[0] if industries else None,
            "start": date_range.get("start") or {},
            "end": date_range.get("end"),
            "description": e.get("description", ""),
        })

    following = profile.get("followingInfo") or {}
    connections_info = profile.get("connections") or {}
    return {
        "name": name,
        "headline": profile.get("headline", ""),
        "summary": profile.get("summary", ""),
        "location": profile.get("geoLocationName", ""),
        "positions": positions,
        "followers": following.get("followerCount", 0),
        "connections": (connections_info.get("paging") or {}).get("total", 0),
    }

Connection Graph Mapping

You can map out a user's visible connections — useful for network analysis and sales intelligence:

def map_connections(
    scraper: LinkedInScraper,
    vanity_name: str,
    max_results: int = 100,
) -> list[dict]:
    """Scrape a user's visible first and second-degree connections."""
    endpoint = f"{scraper.BASE_URL}/search/dash/clusters"
    people: list[dict] = []
    offset = 0
    page_size = 10  # LinkedIn caps at 10 per request

    while offset < max_results:
        resp = scraper.session.get(endpoint, params={
            "decorationId": (
                "com.linkedin.voyager.dash.deco.search"
                ".SearchClusterCollection-186"
            ),
            "origin": "MEMBER_PROFILE_CANNED_SEARCH",
            "q": "all",
            "query": (
                f"(flagshipSearchIntent:SEARCH_SRP,"
                f"queryParameters:(connectionOf:List({vanity_name}),"
                f"network:List(F,S),resultType:List(PEOPLE)))"
            ),
            "start": offset,
            "count": page_size,
        })

        # 429: back off with jitter and retry the same page.
        if resp.status_code == 429:
            wait = random.uniform(45, 90)
            print(f"Rate limited — sleeping {wait:.0f}s")
            time.sleep(wait)
            continue

        resp.raise_for_status()
        payload = resp.json()

        # Only EntityResultViewModel entries are people hits; everything else
        # in `included` is layout/metadata.
        page = [
            {
                "name": el.get("title", {}).get("text", ""),
                "headline": el.get("primarySubtitle", {}).get("text", ""),
                "location": el.get("secondarySubtitle", {}).get("text", ""),
                "profile_url": el.get("navigationUrl", ""),
                "connection_degree": el.get("memberDistance", {}).get("value", ""),
            }
            for el in payload.get("included", [])
            if "EntityResultViewModel" in el.get("$type", "")
        ]
        if not page:
            break
        people.extend(page)

        offset += page_size
        print(f"Fetched {len(people)} connections so far")
        time.sleep(random.uniform(5, 10))

    return people

People Search API

LinkedIn's people search is the real workhorse for lead generation and talent research:

def search_people(
    scraper: LinkedInScraper,
    keywords: str,
    max_results: int = 50,
) -> list[dict]:
    """Search LinkedIn profiles by keyword."""
    endpoint = f"{scraper.BASE_URL}/search/dash/clusters"
    hits: list[dict] = []
    offset = 0
    page_size = 10

    while offset < max_results:
        resp = scraper.session.get(endpoint, params={
            "decorationId": "com.linkedin.voyager.dash.deco.search.SearchClusterCollection-186",
            "origin": "GLOBAL_SEARCH_HEADER",
            "q": "all",
            "query": (
                f"(flagshipSearchIntent:SEARCH_SRP,queryParameters:"
                f"(keywords:List({keywords}),resultType:List(PEOPLE)))"
            ),
            "start": offset,
            "count": page_size,
        })
        # Simple fixed backoff on rate limiting, then retry the same page.
        if resp.status_code == 429:
            time.sleep(60)
            continue
        resp.raise_for_status()
        payload = resp.json()

        # People hits are the EntityResultViewModel entries in `included`.
        page = [
            {
                "name": el.get("title", {}).get("text", ""),
                "headline": el.get("primarySubtitle", {}).get("text", ""),
                "location": el.get("secondarySubtitle", {}).get("text", ""),
                "profile_url": el.get("navigationUrl", ""),
                "connection_degree": el.get("memberDistance", {}).get("value", ""),
            }
            for el in payload.get("included", [])
            if "EntityResultViewModel" in el.get("$type", "")
        ]
        if not page:
            break
        hits.extend(page)

        offset += page_size
        time.sleep(random.uniform(4, 8))

    return hits[:max_results]

Storing Profile Data in SQLite

For any serious lead generation or research workflow, you want a local database:

import json
import sqlite3
from datetime import datetime, timezone

def init_db(db_path: str) -> sqlite3.Connection:
    """Open (or create) the SQLite database and ensure all tables exist.

    Schema: one row per profile keyed by vanity name, with skills,
    positions, and contact info in side tables.
    """
    schema = """
        CREATE TABLE IF NOT EXISTS profiles (
            vanity_name     TEXT PRIMARY KEY,
            name            TEXT,
            headline        TEXT,
            summary         TEXT,
            location        TEXT,
            followers       INTEGER,
            connections     INTEGER,
            scraped_at      TEXT,
            raw_json        TEXT
        );

        CREATE TABLE IF NOT EXISTS skills (
            id              INTEGER PRIMARY KEY AUTOINCREMENT,
            vanity_name     TEXT,
            skill_name      TEXT,
            endorsement_count INTEGER,
            UNIQUE(vanity_name, skill_name)
        );

        CREATE TABLE IF NOT EXISTS positions (
            id              INTEGER PRIMARY KEY AUTOINCREMENT,
            vanity_name     TEXT,
            title           TEXT,
            company         TEXT,
            start_year      INTEGER,
            start_month     INTEGER,
            end_year        INTEGER,
            end_month       INTEGER,
            is_current      INTEGER,
            description     TEXT
        );

        CREATE TABLE IF NOT EXISTS contact_info (
            vanity_name     TEXT PRIMARY KEY,
            email           TEXT,
            phone           TEXT,
            twitter         TEXT,
            websites        TEXT,
            scraped_at      TEXT
        );
    """
    conn = sqlite3.connect(db_path)
    conn.executescript(schema)
    conn.commit()
    return conn


def store_profile(conn: sqlite3.Connection, vanity_name: str, scraper: LinkedInScraper):
    """Fetch a profile (plus skills and positions) and persist it.

    Upserts the profiles row, skips duplicate skills, and appends position
    rows. The raw API payload is stored as JSON so fields we don't parse
    yet can be recovered later without re-scraping.
    """
    # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated (3.12+).
    now = datetime.now(timezone.utc).isoformat()

    raw = scraper.get_profile(vanity_name)
    profile = parse_full_profile(raw)
    skills = scraper.get_skills(vanity_name)
    positions = scraper.get_positions(vanity_name)

    conn.execute(
        "INSERT OR REPLACE INTO profiles "
        "(vanity_name, name, headline, summary, location, followers, connections, scraped_at, raw_json) "
        "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)",
        (vanity_name, profile.get("name"), profile.get("headline"),
         profile.get("summary"), profile.get("location"),
         profile.get("followers"), profile.get("connections"),
         now, json.dumps(raw))
    )

    # executemany batches the per-row inserts into a single call.
    conn.executemany(
        "INSERT OR IGNORE INTO skills (vanity_name, skill_name, endorsement_count) "
        "VALUES (?, ?, ?)",
        [(vanity_name, s["name"], s["endorsement_count"]) for s in skills],
    )
    conn.executemany(
        "INSERT INTO positions "
        "(vanity_name, title, company, start_year, start_month, "
        "end_year, end_month, is_current, description) "
        "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)",
        [
            (vanity_name, p["title"], p["company"],
             p.get("start_year"), p.get("start_month"),
             p.get("end_year"), p.get("end_month"),
             int(p.get("is_current", False)), p.get("description"))
            for p in positions
        ],
    )

    conn.commit()
    print(f"Stored profile: {profile.get('name')} ({vanity_name})")

Sales Navigator Difference

LinkedIn Sales Navigator uses the same Voyager API but with elevated permissions. A Sales Navigator li_at cookie unlocks noticeably broader access than a standard account.

The API calls are nearly identical — the access level is determined by your account type, not different endpoints.

Anti-Detection Measures LinkedIn Uses

Request velocity tracking. A human doesn't view 100 profiles per minute. Keep to 1 profile every 10-15 seconds minimum. Connection mapping needs even longer delays between requests.

Session fingerprinting. Your li_at cookie is tied to specific behavioral patterns. Perfectly regular request timing is a dead giveaway for automation. Use random.uniform() for all delays.

IP correlation. If your account was created from a US IP but makes requests from a Romanian datacenter, that's a flag. ThorData residential proxies let you geo-target to match your account's expected location — city-level targeting is available, so a "San Francisco" LinkedIn account can use Bay Area residential IPs.

TLS fingerprinting. LinkedIn checks your TLS client hello against known browser fingerprints. A standard httpx client has a different TLS fingerprint than Chrome. Consider curl-cffi which impersonates Chrome's TLS fingerprint:

from curl_cffi import requests as cffi_requests

# impersonate="chrome120" makes curl_cffi's TLS ClientHello match Chrome 120,
# so the handshake no longer carries the default Python-client fingerprint.
session = cffi_requests.Session(impersonate="chrome120")
# Reuse the same li_at session cookie as the httpx-based scraper above.
session.cookies.update({"li_at": li_at_cookie})

Phantom profile views. LinkedIn tracks who views what. 500 profile views per day from a new account is a clear pattern. Use private browsing mode in account settings, or keep daily volume under 50.

Account restrictions. First offense: temporary restriction with warning. Second: CAPTCHA on every action. Third: account termination and potential IP ban.

Staying Under the Radar

Use a dedicated account. Never scrape with your real LinkedIn account. Create a separate one, fill out the profile legitimately (real-looking photo, complete work history, 50+ connections), and let it age for 2-3 weeks before starting.

Warm up slowly. Start with 10-20 profile views per day. Increase gradually over weeks. LinkedIn's ML models learn baseline behavior per account — sudden spikes from zero are flagged.

Mix in normal activity. Between scraping requests, load the feed, check notifications, like a post. Make the session look organic.

Rotate sessions. Use 2-3 hour windows with breaks between sessions. Mimic human working hours (don't scrape at 3am from a "US" account).

Respect rate limit thresholds: - Between individual profile fetches: 8-15 seconds - Between searches: 15-30 seconds - After 50 profiles: 10-minute break - After 200 profiles: done for the day

def scrape_profile_batch(
    scraper: LinkedInScraper,
    vanity_names: list[str],
    db_conn: sqlite3.Connection,
):
    """Scrape a batch of profiles with human-like timing and auto-breaks."""
    total = len(vanity_names)
    for idx, vanity in enumerate(vanity_names):
        print(f"[{idx+1}/{total}] Scraping {vanity}...")

        try:
            store_profile(db_conn, vanity, scraper)
        except Exception as exc:
            print(f"  Error: {exc}")
            message = str(exc)
            if "429" in message or "blocked" in message.lower():
                print("  Rate limited — sleeping 5 minutes")
                time.sleep(300)
            continue

        # Variable delay: mostly 8-15s, occasionally longer
        delay = random.choices(
            [random.uniform(8, 15), random.uniform(30, 60), random.uniform(120, 180)],
            weights=[80, 15, 5],
        )[0]

        # Longer break every 20-30 profiles
        take_break = idx > 0 and idx % random.randint(20, 30) == 0
        if take_break:
            break_time = random.uniform(600, 900)
            print(f"  Taking {break_time/60:.0f} min break after {idx} profiles...")
            time.sleep(break_time)
        else:
            print(f"  Waiting {delay:.1f}s")
            time.sleep(delay)

Using ThorData Residential Proxies

For sustained scraping operations, rotating residential proxies are essential. Datacenter IPs get flagged almost immediately by LinkedIn's detection systems. ThorData's residential proxy network provides IPs from real ISPs with city-level targeting:

def build_thordata_proxy(
    user: str,
    password: str,
    country: str = "US",
    city: str = "SanFrancisco",
) -> str:
    """Build ThorData proxy URL with geo-targeting.

    The gateway encodes targeting in the proxy username as
    ``user-country-XX-city-Name``.
    """
    username = f"{user}-country-{country}-city-{city}"
    return f"http://{username}:{password}@proxy.thordata.com:9000"

# Create scraper with geo-matched residential proxy
proxy = build_thordata_proxy(
    user="YOUR_USER",
    password="YOUR_PASS",
    country="US",
    city="SanFrancisco",  # Match your LinkedIn account's listed location
)
# The proxy URL is handed to httpx's HTTPTransport inside LinkedInScraper.
scraper = LinkedInScraper(li_at, proxy_url=proxy)

For rotating proxies between batches (recommended to avoid any single IP accumulating too many LinkedIn requests):

class RotatingLinkedInScraper:
    """LinkedIn scraper that rotates proxies between request batches.

    Wraps LinkedInScraper and rebuilds it (with a fresh proxy session id)
    after every ``rotate_every`` profile fetches.
    """

    def __init__(self, li_at: str, proxy_base: str, rotate_every: int = 15):
        self.li_at = li_at
        self.proxy_base = proxy_base
        self.rotate_every = rotate_every
        self._scraper = None   # lazily created on first get_profile()
        self._count = 0        # fetches made through the current scraper

    def _new_scraper(self) -> LinkedInScraper:
        # A random session parameter asks ThorData for a fresh exit IP.
        suffix = random.randint(10000, 99999)
        return LinkedInScraper(
            self.li_at,
            proxy_url=f"{self.proxy_base}?session={suffix}",
        )

    def get_profile(self, vanity_name: str) -> dict:
        due_for_rotation = self._scraper is None or self._count >= self.rotate_every
        if due_for_rotation:
            self._scraper = self._new_scraper()
            self._count = 0
        self._count += 1
        return self._scraper.get_profile(vanity_name)

Use Cases and Ethics

Legitimate uses for LinkedIn scraping include academic labor market research, recruitment intelligence, sales prospecting, job market analytics (tracking which skills are growing in demand), and company intelligence (monitoring leadership changes).

The line not to cross: scraping at massive scale for resale, building products competing directly with Sales Navigator or Recruiter, or combining LinkedIn data with other datasets to build surveillance tools.

LinkedIn scraping is a cat-and-mouse game. They have one of the best anti-bot teams in the industry and are not afraid to pursue legal action against commercial scraping operations. For personal research and small-scale data collection, the Voyager API approach works well — just be smart about velocity, use proper residential proxy infrastructure, and don't push your luck.