Scraping Twitter/X Followers and Following (2026)
Twitter — or X, whatever you want to call it — has made scraping progressively harder since the Musk acquisition. The free API tier is basically useless for follower data, and the paid tiers are expensive. But the web app still needs to load this data somehow, and that's where we can intercept it.
The approach that works in 2026 is hitting Twitter's internal GraphQL API endpoints directly. These are the same endpoints the web client uses. They're undocumented and can change without notice, but they've been relatively stable for the past year.
This guide covers guest token authentication, user ID resolution, follower/following pagination, rate limit handling, proxy setup, SQLite storage, and practical gotchas you'll hit in production.
How Twitter's Internal API Works
The Twitter web app makes GraphQL requests to api.x.com/graphql/{query_id}/{operation}. The query_id is a hash that identifies the specific query in Twitter's internal schema. These IDs are embedded in the JavaScript bundles Twitter serves to browsers.
Authentication happens in two layers: 1. A long-lived Bearer token embedded in the JS bundle (public, stable for years) 2. A short-lived guest token generated per-session
Guest tokens work for public profiles without a login. For private accounts, you need authenticated cookies from an active browser session.
Installation
pip install httpx
Guest Token Authentication
import httpx
import json
import time
import random
import sqlite3
from datetime import datetime, timezone
# Public Bearer token embedded in Twitter's web app JS
# This has been the same for years
# NOTE(review): this is the web client's *public* token, not a secret — it only
# authorizes what a logged-out browser session could already reach.
BEARER_TOKEN = (
    "AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs"
    "%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA"
)

# Baseline headers sent on every request. Per-session headers
# (X-Guest-Token) are layered on top by make_headers().
BASE_HEADERS = {
    "Authorization": f"Bearer {BEARER_TOKEN}",
    "Content-Type": "application/json",
    "Accept": "*/*",
    "Accept-Language": "en-US,en;q=0.9",
    "x-twitter-client-language": "en",
    "x-twitter-active-user": "yes",
    "Origin": "https://x.com",
    "Referer": "https://x.com/",
}
def get_guest_token(client: httpx.Client | None = None) -> str:
    """Get a fresh guest token from Twitter.

    Args:
        client: Optional pre-configured httpx.Client to reuse. When omitted,
            a temporary client is created and closed before returning
            (the original leaked this throwaway client).

    Returns:
        The guest token string.

    Raises:
        RuntimeError: if the activation response contains no token.
        httpx.HTTPStatusError: on a non-2xx activation response.
    """
    def _activate(c: httpx.Client) -> str:
        # POST with only the public Bearer token; no cookies needed.
        resp = c.post(
            "https://api.x.com/1.1/guest/activate.json",
            headers={"Authorization": f"Bearer {BEARER_TOKEN}"},
        )
        resp.raise_for_status()
        token = resp.json().get("guest_token")
        if not token:
            raise RuntimeError("Failed to get guest token")
        return token

    if client is not None:
        return _activate(client)
    # Own the throwaway client so its connections are released promptly.
    with httpx.Client(headers=BASE_HEADERS, timeout=15) as c:
        return _activate(c)
def make_headers(guest_token: str) -> dict:
    """Return the full request header set for a guest-token session.

    Copies BASE_HEADERS and layers the per-session guest token on top.
    """
    headers = dict(BASE_HEADERS)
    headers["X-Guest-Token"] = guest_token
    return headers
# Get a fresh guest token
# NOTE(review): runs at import time and performs a live network request;
# for library use this belongs behind `if __name__ == "__main__":`.
guest_token = get_guest_token()
print(f"Guest token: {guest_token[:20]}...")
Resolving Screen Names to User IDs
Twitter's GraphQL endpoints use numeric user IDs, not screen names:
def get_user_id(
    screen_name: str,
    guest_token: str,
    client: httpx.Client | None = None,
) -> str:
    """Resolve a Twitter/X screen name to a numeric user ID.

    Args:
        screen_name: Handle without the leading "@".
        guest_token: Token from get_guest_token().
        client: Optional httpx.Client to reuse; a temporary one is created
            and closed otherwise (the original leaked it).

    Returns:
        The account's numeric rest_id as a string.

    Raises:
        RuntimeError: if the user does not exist (empty GraphQL result).
        httpx.HTTPStatusError: on a non-2xx response.
    """
    url = "https://api.x.com/graphql/xmU6X_CKVnQ5lSrCbAmJsg/UserByScreenName"
    variables = json.dumps({
        "screen_name": screen_name,
        "withSafetyModeUserFields": True,
    })
    # Feature flags the endpoint requires; missing flags trigger 400s.
    features = json.dumps({
        "hidden_profile_subscriptions_enabled": True,
        "profile_label_improvements_pcf_label_in_post_enabled": False,
        "rweb_tipjar_consumption_enabled": True,
        "responsive_web_graphql_exclude_directive_enabled": True,
        "verified_phone_label_enabled": False,
        "responsive_web_graphql_skip_user_profile_image_extensions_enabled": False,
        "responsive_web_graphql_timeline_navigation_enabled": True,
    })
    owns_client = client is None
    c = client if client is not None else httpx.Client(timeout=20)
    try:
        resp = c.get(
            url,
            params={"variables": variables, "features": features},
            headers=make_headers(guest_token),
        )
        resp.raise_for_status()
        data = resp.json()
    finally:
        if owns_client:
            c.close()
    result = (
        data.get("data", {})
        .get("user", {})
        .get("result", {})
    )
    if not result:
        raise RuntimeError(f"User @{screen_name} not found")
    return result["rest_id"]
def get_user_profile(screen_name: str, guest_token: str) -> dict:
    """Get full profile data for a user.

    Args:
        screen_name: Handle without the leading "@".
        guest_token: Token from get_guest_token().

    Returns:
        A flat dict of profile fields (id, name, counts, etc.).

    Raises:
        RuntimeError: if the user does not exist. (The original silently
            returned a dict full of Nones; now consistent with get_user_id.)
        httpx.HTTPStatusError: on a non-2xx response.
    """
    url = "https://api.x.com/graphql/xmU6X_CKVnQ5lSrCbAmJsg/UserByScreenName"
    variables = json.dumps({"screen_name": screen_name, "withSafetyModeUserFields": True})
    features = json.dumps({
        "hidden_profile_subscriptions_enabled": True,
        "responsive_web_graphql_exclude_directive_enabled": True,
        "verified_phone_label_enabled": False,
        "responsive_web_graphql_timeline_navigation_enabled": True,
    })
    resp = httpx.get(
        url,
        params={"variables": variables, "features": features},
        headers=make_headers(guest_token),
        timeout=20,
    )
    resp.raise_for_status()
    result = resp.json().get("data", {}).get("user", {}).get("result", {})
    if not result:
        raise RuntimeError(f"User @{screen_name} not found")
    # Most profile fields live under the "legacy" (pre-GraphQL) sub-object.
    legacy = result.get("legacy", {})
    return {
        "id": result.get("rest_id"),
        "name": legacy.get("name"),
        "screen_name": legacy.get("screen_name"),
        "description": legacy.get("description"),
        "followers_count": legacy.get("followers_count", 0),
        "following_count": legacy.get("friends_count", 0),
        "tweet_count": legacy.get("statuses_count", 0),
        "listed_count": legacy.get("listed_count", 0),
        "verified": result.get("is_blue_verified", False),
        "location": legacy.get("location"),
        "created_at": legacy.get("created_at"),
        "profile_image_url": legacy.get("profile_image_url_https"),
    }
Scraping Followers
The followers GraphQL endpoint uses cursor-based pagination:
def parse_timeline_users(data: dict) -> tuple[list[dict], str | None]:
"""Extract user records and next cursor from a GraphQL timeline response."""
users = []
next_cursor = None
timeline = (
data.get("data", {})
.get("user", {})
.get("result", {})
.get("timeline", {})
.get("timeline", {})
)
instructions = timeline.get("instructions", [])
for instruction in instructions:
entries = instruction.get("entries", [])
for entry in entries:
content = entry.get("content", {})
entry_type = content.get("entryType", "")
# Cursor entries — track "Bottom" for next page
if entry_type == "TimelineTimelineCursor":
if content.get("cursorType") == "Bottom":
next_cursor = content.get("value")
continue
# User entries
item = content.get("itemContent", {})
if item.get("itemType") != "TimelineUser":
continue
user_result = item.get("user_results", {}).get("result", {})
legacy = user_result.get("legacy", {})
if not legacy:
continue
users.append({
"id": user_result.get("rest_id", ""),
"name": legacy.get("name", ""),
"screen_name": legacy.get("screen_name", ""),
"description": legacy.get("description", ""),
"followers_count": legacy.get("followers_count", 0),
"following_count": legacy.get("friends_count", 0),
"tweet_count": legacy.get("statuses_count", 0),
"verified": user_result.get("is_blue_verified", False),
"location": legacy.get("location", ""),
"profile_image": legacy.get("profile_image_url_https", ""),
"created_at": legacy.get("created_at", ""),
})
return users, next_cursor
def scrape_followers(
    user_id: str,
    guest_token: str,
    max_pages: int = 10,
    proxy: str | None = None,
) -> list[dict]:
    """Scrape followers for a Twitter/X user via GraphQL.

    Args:
        user_id: Numeric rest_id of the target account.
        guest_token: Token from get_guest_token(); rotated in place on 429s
            and proactively every 50 requests.
        max_pages: Maximum number of *successful* pages to fetch (~20
            followers each). Retries no longer consume page budget —
            the original burned a page slot on every 429/timeout.
        proxy: Optional proxy URL passed to httpx.Client.

    Returns:
        List of follower dicts as produced by parse_timeline_users().
    """
    url = "https://api.x.com/graphql/rRXFSG5vR6drKr5BPitHew/Followers"
    client_kwargs = {"headers": make_headers(guest_token), "timeout": 20}
    if proxy:
        client_kwargs["proxy"] = proxy
    all_followers: list[dict] = []
    cursor = None
    tokens_used = 0
    max_retries = 3  # transient-failure retries per page before giving up
    with httpx.Client(**client_kwargs) as client:
        page = 0
        retries = 0
        while page < max_pages:
            variables = {
                "userId": user_id,
                "count": 20,  # Twitter caps this at 20 for followers
                "includePromotedContent": False,
            }
            if cursor:
                variables["cursor"] = cursor
            features = {
                "rweb_tipjar_consumption_enabled": True,
                "responsive_web_graphql_exclude_directive_enabled": True,
                "verified_phone_label_enabled": False,
                "responsive_web_graphql_timeline_navigation_enabled": True,
                "responsive_web_graphql_skip_user_profile_image_extensions_enabled": False,
                "creator_subscriptions_tweet_preview_api_enabled": True,
            }
            try:
                resp = client.get(
                    url,
                    params={
                        "variables": json.dumps(variables),
                        "features": json.dumps(features),
                    },
                )
                tokens_used += 1
                if resp.status_code == 429:
                    # Rate limited — retry the SAME page with a fresh guest token.
                    retries += 1
                    if retries > max_retries:
                        print(f"Giving up on page {page+1} after repeated 429s")
                        break
                    print(f"Rate limited on page {page+1}. Getting fresh guest token...")
                    guest_token = get_guest_token()
                    client.headers.update({"X-Guest-Token": guest_token})
                    time.sleep(5)
                    continue
                if resp.status_code == 403:
                    print(f"403 on page {page+1} — private account or endpoint changed")
                    break
                resp.raise_for_status()
                data = resp.json()
            except httpx.TimeoutException:
                retries += 1
                if retries > max_retries:
                    print(f"Giving up on page {page+1} after repeated timeouts")
                    break
                print(f"Timeout on page {page+1}")
                time.sleep(5)
                continue
            except Exception as e:
                print(f"Error on page {page+1}: {e}")
                break
            retries = 0  # success — reset the per-page retry budget
            users, next_cursor = parse_timeline_users(data)
            all_followers.extend(users)
            cursor = next_cursor
            print(f" Page {page+1}: +{len(users)} followers ({len(all_followers)} total)")
            page += 1
            # No bottom cursor or an empty page means we reached the end.
            if not cursor or not users:
                break
            # Rotate guest token proactively every 50 requests
            if tokens_used >= 50:
                guest_token = get_guest_token()
                client.headers.update({"X-Guest-Token": guest_token})
                tokens_used = 0
            time.sleep(random.uniform(3, 6))
    return all_followers
# Example usage
# Resolves the @OpenAI handle to its numeric ID, then pages through up to
# 10 pages (~20 followers each). Performs live network requests at import.
guest_token = get_guest_token()
user_id = get_user_id("OpenAI", guest_token)
followers = scrape_followers(user_id, guest_token, max_pages=10)
print(f"\nCollected {len(followers)} followers")
Scraping Following (Who Someone Follows)
The Following endpoint works identically — just a different query ID:
def scrape_following(
    user_id: str,
    guest_token: str,
    max_pages: int = 10,
    proxy: str | None = None,
) -> list[dict]:
    """Scrape the accounts a user follows via GraphQL.

    Args:
        user_id: Numeric rest_id of the account whose following list to fetch.
        guest_token: Token from get_guest_token(); refreshed on 429
            (matching scrape_followers — the original retried with the
            same exhausted token).
        max_pages: Maximum number of *successful* pages (~20 users each);
            retries no longer consume page budget.
        proxy: Optional proxy URL passed to httpx.Client.

    Returns:
        List of user dicts as produced by parse_timeline_users().
    """
    url = "https://api.x.com/graphql/iSicc7LrzWGBgDPL0tM_TQ/Following"
    client_kwargs = {"headers": make_headers(guest_token), "timeout": 20}
    if proxy:
        client_kwargs["proxy"] = proxy
    all_following: list[dict] = []
    cursor = None
    max_retries = 3  # transient-failure retries per page
    with httpx.Client(**client_kwargs) as client:
        page = 0
        retries = 0
        while page < max_pages:
            variables = {
                "userId": user_id,
                "count": 20,
                "includePromotedContent": False,
            }
            if cursor:
                variables["cursor"] = cursor
            features = {
                "rweb_tipjar_consumption_enabled": True,
                "responsive_web_graphql_exclude_directive_enabled": True,
                "verified_phone_label_enabled": False,
                "responsive_web_graphql_timeline_navigation_enabled": True,
            }
            try:
                resp = client.get(
                    url,
                    params={
                        "variables": json.dumps(variables),
                        "features": json.dumps(features),
                    },
                )
                if resp.status_code == 429:
                    retries += 1
                    if retries > max_retries:
                        print(f"Giving up on page {page+1} after repeated 429s")
                        break
                    # A 429 usually means the token is exhausted — refresh it
                    # before retrying (consistent with scrape_followers).
                    guest_token = get_guest_token()
                    client.headers.update({"X-Guest-Token": guest_token})
                    time.sleep(30)
                    continue
                resp.raise_for_status()
            except Exception as e:
                print(f"Error on page {page+1}: {e}")
                break
            retries = 0
            users, next_cursor = parse_timeline_users(resp.json())
            all_following.extend(users)
            cursor = next_cursor
            print(f" Page {page+1}: +{len(users)} following ({len(all_following)} total)")
            page += 1
            if not cursor or not users:
                break
            time.sleep(random.uniform(3, 5))
    return all_following
Rate Limits and Anti-Bot Measures
Twitter's rate limiting is aggressive. Here's what you'll encounter:
Guest token exhaustion. Each guest token gets roughly 150 requests before it returns 429s. Get a fresh token proactively every 50 requests rather than waiting for errors.
IP-based throttling. Even with fresh guest tokens, the same IP gets throttled after sustained use. Residential proxy rotation is essential for any serious volume. ThorData's residential proxies rotate IPs automatically, spreading the load so each IP stays within guest token limits:
PROXY_URL = "http://USER:[email protected]:9000"
# Pass proxy to scraping functions
followers = scrape_followers(user_id, guest_token, max_pages=20, proxy=PROXY_URL)
GraphQL endpoint changes. Twitter periodically changes the query IDs in their GraphQL URLs. The hash-like string in the URL (e.g., rRXFSG5vR6drKr5BPitHew) can change. If requests suddenly return 404, check Twitter's web app JavaScript for the new ID. A quick way: open Twitter in a browser, open DevTools Network tab, filter for graphql, then look for Followers requests.
Private accounts. Guest tokens only work for public profiles. Private accounts return 403. Scraping private accounts requires authenticated cookies from an active browser session — and also violates Twitter's ToS.
Browser fingerprinting. Twitter checks for consistent client behavior: headers, TLS fingerprint, request patterns. Using a residential proxy helps because requests blend in with normal browser traffic.
SQLite Storage for Follower Network Analysis
For network analysis and change detection:
def init_twitter_db(db_path: str = "twitter_followers.db") -> sqlite3.Connection:
    """Initialize a SQLite database for Twitter follower data.

    Creates (if absent) the user table, the follower/following edge tables,
    and their supporting indexes, then returns an open connection.
    """
    conn = sqlite3.connect(db_path)
    # WAL lets readers proceed while a write is in progress.
    conn.execute("PRAGMA journal_mode=WAL")
    schema = """
    CREATE TABLE IF NOT EXISTS twitter_users (
        id TEXT PRIMARY KEY,
        screen_name TEXT UNIQUE,
        name TEXT,
        description TEXT,
        followers_count INTEGER,
        following_count INTEGER,
        tweet_count INTEGER,
        verified INTEGER DEFAULT 0,
        location TEXT,
        created_at TEXT,
        fetched_at TEXT DEFAULT (datetime('now'))
    );
    CREATE TABLE IF NOT EXISTS follower_edges (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        source_id TEXT NOT NULL,   -- the account being followed
        follower_id TEXT NOT NULL, -- the account doing the following
        discovered_at TEXT DEFAULT (datetime('now')),
        UNIQUE(source_id, follower_id)
    );
    CREATE TABLE IF NOT EXISTS following_edges (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        source_id TEXT NOT NULL, -- the account doing the following
        target_id TEXT NOT NULL, -- the account being followed
        discovered_at TEXT DEFAULT (datetime('now')),
        UNIQUE(source_id, target_id)
    );
    CREATE INDEX IF NOT EXISTS idx_followers_source ON follower_edges(source_id);
    CREATE INDEX IF NOT EXISTS idx_followers_follower ON follower_edges(follower_id);
    CREATE INDEX IF NOT EXISTS idx_following_source ON following_edges(source_id);
    CREATE INDEX IF NOT EXISTS idx_users_screen ON twitter_users(screen_name);
    CREATE INDEX IF NOT EXISTS idx_users_followers ON twitter_users(followers_count DESC);
    """
    conn.executescript(schema)
    conn.commit()
    return conn
def store_users(conn: sqlite3.Connection, users: list[dict]) -> int:
    """Batch-insert/update user records.

    Args:
        conn: Open connection with the twitter_users table.
        users: Dicts shaped like parse_timeline_users()/get_user_profile()
            output; entries without an "id" are skipped.

    Returns:
        Number of records inserted or updated.

    Note:
        The original upsert only refreshed screen_name/followers_count/
        fetched_at, silently dropping changes to name, description, the
        other counters, verified, and location — now all mutable profile
        fields are refreshed. created_at is kept from first sight.
    """
    stored = 0
    now = datetime.now(timezone.utc).isoformat()
    for u in users:
        if not u.get("id"):
            continue
        conn.execute("""
            INSERT INTO twitter_users
            (id, screen_name, name, description, followers_count,
             following_count, tweet_count, verified, location, created_at, fetched_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(id) DO UPDATE SET
                screen_name=excluded.screen_name,
                name=excluded.name,
                description=excluded.description,
                followers_count=excluded.followers_count,
                following_count=excluded.following_count,
                tweet_count=excluded.tweet_count,
                verified=excluded.verified,
                location=excluded.location,
                fetched_at=excluded.fetched_at
        """, (
            u["id"], u.get("screen_name"), u.get("name"),
            u.get("description"), u.get("followers_count", 0),
            u.get("following_count", 0), u.get("tweet_count", 0),
            int(u.get("verified", False)), u.get("location"),
            u.get("created_at"), now,
        ))
        stored += 1
    conn.commit()
    return stored
def store_follower_edges(
    conn: sqlite3.Connection,
    source_id: str,
    followers: list[dict],
) -> int:
    """Store follower relationships.

    Best-effort: records without an id are skipped, per-row SQL errors are
    swallowed, and duplicates are absorbed by INSERT OR IGNORE. Returns the
    number of edges actually inserted (not merely attempted).
    """
    new_edges = 0
    for follower in followers:
        follower_id = follower.get("id")
        if not follower_id:
            continue
        try:
            conn.execute("""
                INSERT OR IGNORE INTO follower_edges (source_id, follower_id)
                VALUES (?, ?)
            """, (source_id, follower_id))
            # changes() is 0 when the IGNORE path fired, 1 on a real insert.
            new_edges += conn.execute("SELECT changes()").fetchone()[0]
        except sqlite3.Error:
            pass
    conn.commit()
    return new_edges
def get_mutual_followers(conn: sqlite3.Connection, user_a: str, user_b: str) -> list[str]:
    """Find users who follow both user_a and user_b."""
    query = """
        SELECT fe1.follower_id
        FROM follower_edges fe1
        JOIN follower_edges fe2 ON fe1.follower_id = fe2.follower_id
        WHERE fe1.source_id = ? AND fe2.source_id = ?
    """
    cursor = conn.execute(query, (user_a, user_b))
    return [follower_id for (follower_id,) in cursor.fetchall()]
def get_overlap_accounts(
    conn: sqlite3.Connection,
    user_id: str,
    min_followers: int = 1000,
    limit: int = 50,
) -> list[dict]:
    """Find high-follower accounts among the followers of a user.

    Args:
        conn: Open SQLite connection.
        user_id: Source account whose follower list is examined.
        min_followers: Minimum follower count for inclusion.
        limit: Maximum rows returned (previously hard-coded to 50; the
            default preserves the old behavior).

    Returns:
        Dicts of screen_name/name/followers, ordered by followers desc.
    """
    rows = conn.execute("""
        SELECT u.screen_name, u.name, u.followers_count
        FROM follower_edges fe
        JOIN twitter_users u ON u.id = fe.follower_id
        WHERE fe.source_id = ?
          AND u.followers_count >= ?
        ORDER BY u.followers_count DESC
        LIMIT ?
    """, (user_id, min_followers, limit)).fetchall()
    return [{"screen_name": r[0], "name": r[1], "followers": r[2]} for r in rows]
Practical Collection Pipeline
def collect_follower_network(
    screen_name: str,
    db_path: str = "twitter_followers.db",
    max_pages: int = 50,
    proxy: str | None = None,
) -> dict:
    """Collect followers (and following) for an account and store to SQLite.

    Args:
        screen_name: Handle without the leading "@".
        db_path: SQLite file; created/initialized if missing.
        max_pages: Page budget for the follower scrape.
        proxy: Optional proxy URL forwarded to the scrapers.

    Returns:
        Summary dict with scraped/stored counts.

    Note:
        The connection is now closed via try/finally — the original leaked
        it when any network call raised mid-run.
    """
    conn = init_twitter_db(db_path)
    try:
        guest_token = get_guest_token()
        # Resolve screen name to numeric ID and snapshot the profile row.
        print(f"Resolving @{screen_name}...")
        profile = get_user_profile(screen_name, guest_token)
        user_id = profile["id"]
        store_users(conn, [profile])
        print(f" {profile['name']} | {profile['followers_count']:,} followers | ID: {user_id}")
        # Collect followers
        print(f"\nScraping followers (max {max_pages} pages)...")
        followers = scrape_followers(
            user_id, guest_token, max_pages=max_pages, proxy=proxy
        )
        users_stored = store_users(conn, followers)
        edges_stored = store_follower_edges(conn, user_id, followers)
        # Also collect following
        print("\nScraping following list...")
        following = scrape_following(
            user_id, guest_token, max_pages=20, proxy=proxy
        )
        store_users(conn, following)
        # Batch insert (the original issued one execute per row).
        conn.executemany(
            "INSERT OR IGNORE INTO following_edges (source_id, target_id) VALUES (?, ?)",
            [(user_id, f["id"]) for f in following if f.get("id")],
        )
        conn.commit()
    finally:
        conn.close()
    return {
        "user": screen_name,
        "user_id": user_id,
        "followers_scraped": len(followers),
        "following_scraped": len(following),
        "users_stored": users_stored,
        "edges_stored": edges_stored,
    }
# Example run
# NOTE(review): the proxy URL below is a placeholder — substitute real
# credentials or drop the argument to connect directly. Runs live network
# requests at import time.
result = collect_follower_network(
    "anthropic",
    max_pages=25,
    proxy="http://USER:[email protected]:9000",
)
print(f"\nDone: {result['followers_scraped']} followers, {result['following_scraped']} following")
Practical Considerations
Rotate guest tokens proactively. Get a fresh one every 50 requests. Don't wait for a 429 — proactive rotation keeps runs smooth.
Save progress incrementally. Write followers to the database as you go. If you get rate limited on page 47 of 100, you don't want to lose everything.
The cursor is your bookmark. Save the last cursor value so you can resume pagination later without starting over. Add a cursor column to your job tracking table.
Don't try to scrape mega-accounts in one session. An account with 5 million followers requires hundreds of API calls. Spread it over multiple days with different proxy IPs. Store the cursor position between runs.
Watch for endpoint changes. Twitter periodically rotates the GraphQL query IDs. If you suddenly get 404s or empty results, the query hash likely changed. The fix is to check the network requests in Twitter's web app and update the URL.
Twitter scraping is a moving target — endpoints change, rate limits shift, new detection methods appear. But the fundamental approach of guest tokens + GraphQL + cursor pagination has remained consistent for over a year. Stay flexible, monitor for endpoint changes, and rotate your proxy infrastructure.
Detecting GraphQL Endpoint Changes
When Twitter updates their web app bundle, query IDs change. Here's how to detect and update them automatically:
import re
def extract_graphql_ids_from_bundle() -> dict[str, str]:
    """
    Scrape Twitter's main JS bundle to extract current GraphQL query IDs.

    Falls back to the cached (known) IDs for any operation not found in the
    bundles, or entirely when the bundle isn't reachable.

    Returns:
        Mapping of operation name -> query ID for Followers, Following,
        and UserByScreenName.
    """
    known_ids = {
        "Followers": "rRXFSG5vR6drKr5BPitHew",
        "Following": "iSicc7LrzWGBgDPL0tM_TQ",
        "UserByScreenName": "xmU6X_CKVnQ5lSrCbAmJsg",
    }
    # Compile once, outside the bundle loop (the original re-ran findall
    # with string patterns per bundle).
    bundle_url_re = re.compile(r'"(https://abs\.twimg\.com/responsive-web[^"]+\.js)"')
    # Pattern: queryId:"HASH",operationName:"NAME"
    query_id_re = re.compile(
        r'queryId:"([a-zA-Z0-9_-]{20,})",operationName:"(Followers|Following|UserByScreenName)"'
    )
    try:
        # Fetch Twitter's main page to find JS bundle URLs
        resp = httpx.get(
            "https://x.com",
            headers={
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                "AppleWebKit/537.36 Chrome/124.0.0.0 Safari/537.36"
            },
            timeout=15,
            follow_redirects=True,
        )
        js_urls = bundle_url_re.findall(resp.text)
        if not js_urls:
            return known_ids
        # Search a couple of bundles for query IDs
        for js_url in js_urls[:3]:
            js_resp = httpx.get(js_url, timeout=15)
            if js_resp.status_code != 200:
                # Skip unreachable bundles instead of grepping error HTML
                # (the original silently searched 404 pages).
                continue
            for query_id, operation in query_id_re.findall(js_resp.text):
                known_ids[operation] = query_id
    except Exception as e:
        print(f"Bundle extraction failed: {e}. Using cached IDs.")
    return known_ids
# Update your endpoint URLs dynamically
def build_endpoint_urls() -> dict[str, str]:
    """Map each GraphQL operation name to its current full endpoint URL."""
    urls: dict[str, str] = {}
    for op, qid in extract_graphql_ids_from_bundle().items():
        urls[op] = f"https://api.x.com/graphql/{qid}/{op}"
    return urls
# Resolve current endpoint URLs once at startup (performs a network request;
# falls back to the cached query IDs when the bundle can't be fetched).
ENDPOINTS = build_endpoint_urls()
print("Current GraphQL endpoints:")
for op, url in ENDPOINTS.items():
    print(f" {op}: {url}")
Follower Network Analysis
Once you have follower data in SQLite, network analysis reveals influence patterns:
def find_influential_followers(
    conn: sqlite3.Connection,
    source_id: str,
    min_followers: int = 10000,
) -> list[dict]:
    """Find high-influence accounts in someone's follower list.

    Args:
        conn: Open SQLite connection.
        source_id: Account whose followers are examined.
        min_followers: Minimum follower count for inclusion.

    Returns:
        Up to 50 dicts (screen_name, name, followers, verified, bio),
        ordered by follower count descending.
    """
    rows = conn.execute("""
        SELECT u.screen_name, u.name, u.followers_count, u.verified, u.description
        FROM follower_edges fe
        JOIN twitter_users u ON u.id = fe.follower_id
        WHERE fe.source_id = ?
          AND u.followers_count >= ?
        ORDER BY u.followers_count DESC
        LIMIT 50
    """, (source_id, min_followers)).fetchall()
    return [
        {
            "screen_name": r[0], "name": r[1],
            "followers": r[2], "verified": bool(r[3]),
            "bio": r[4],
        }
        for r in rows
    ]
def compute_follower_overlap(
    conn: sqlite3.Connection,
    user_a_id: str,
    user_b_id: str,
) -> dict:
    """Calculate follower overlap between two accounts.

    Args:
        conn: Open SQLite connection with follower_edges populated.
        user_a_id / user_b_id: The two accounts to compare.

    Returns:
        Dict with raw counts, shared count, Jaccard similarity, and the
        shared percentage of each side's follower base.

    Note:
        Restores COUNT(*) — the markdown-mangled source had bare COUNT(),
        which SQLite rejects as a wrong-arity function call.
    """
    total_a = conn.execute(
        "SELECT COUNT(*) FROM follower_edges WHERE source_id = ?", (user_a_id,)
    ).fetchone()[0]
    total_b = conn.execute(
        "SELECT COUNT(*) FROM follower_edges WHERE source_id = ?", (user_b_id,)
    ).fetchone()[0]
    shared = conn.execute("""
        SELECT COUNT(DISTINCT fe1.follower_id)
        FROM follower_edges fe1
        JOIN follower_edges fe2 ON fe1.follower_id = fe2.follower_id
        WHERE fe1.source_id = ? AND fe2.source_id = ?
    """, (user_a_id, user_b_id)).fetchone()[0]
    union = total_a + total_b - shared
    jaccard = shared / union if union > 0 else 0
    return {
        "followers_a": total_a,
        "followers_b": total_b,
        "shared": shared,
        "jaccard_similarity": round(jaccard, 4),
        "pct_of_a": round(shared / total_a * 100, 1) if total_a else 0,
        "pct_of_b": round(shared / total_b * 100, 1) if total_b else 0,
    }
def follower_growth_tracker(
    screen_names: list[str],
    db_path: str = "twitter_followers.db",
    proxy: str | None = None,
) -> None:
    """Track follower counts over time for a watchlist of accounts.

    Appends one row per account to follower_count_history each run, so
    repeated runs give a time series.

    Args:
        screen_names: Handles to sample.
        db_path: SQLite file; created/initialized if missing.
        proxy: Accepted for interface parity but currently unused —
            TODO: thread it through to the profile requests.
    """
    conn = init_twitter_db(db_path)
    guest_token = get_guest_token()
    # Add a tracking table
    conn.execute("""
        CREATE TABLE IF NOT EXISTS follower_count_history (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            user_id TEXT NOT NULL,
            screen_name TEXT NOT NULL,
            followers_count INTEGER,
            following_count INTEGER,
            tweet_count INTEGER,
            recorded_at TEXT DEFAULT (datetime('now'))
        )
    """)
    conn.commit()
    for screen_name in screen_names:
        try:
            profile = get_user_profile(screen_name, guest_token)
            conn.execute("""
                INSERT INTO follower_count_history
                (user_id, screen_name, followers_count, following_count, tweet_count)
                VALUES (?, ?, ?, ?, ?)
            """, (
                profile["id"], profile["screen_name"],
                profile["followers_count"], profile["following_count"],
                profile["tweet_count"],
            ))
            conn.commit()
            print(f" @{screen_name}: {profile['followers_count']:,} followers")
            # Refresh guest token periodically
            # NOTE(review): this refreshes on EVERY account — one extra
            # request per handle; consider refreshing every N accounts.
            guest_token = get_guest_token()
            time.sleep(random.uniform(2, 4))
        except Exception as e:
            print(f" @{screen_name}: error — {e}")
    conn.close()
# Track a watchlist daily
watchlist = ["OpenAI", "AnthropicAI", "GoogleDeepMind", "xai"]
follower_growth_tracker(watchlist)