How to Scrape Instagram Profiles in 2026: Bio, Followers, Posts & Media
Instagram profile data powers influencer research tools, competitive analysis, audience intelligence platforms, and content benchmarking. Getting this data programmatically requires navigating Meta's evolving restrictions — Instagram has steadily locked down API access while making the platform more dependent on authenticated sessions.
This guide covers three approaches: public og:meta scraping (no auth, limited data), the private mobile API (auth required, full data), and GraphQL endpoint extraction. Each has different tradeoffs around data richness, reliability, and legal risk.
What Profile Data Is Available
A complete Instagram profile contains:
- Basic info: username, full name, bio, website URL, profile picture
- Stats: follower count, following count, post count
- Account type: personal, creator, or business
- Business info: category, contact options, address (for business accounts)
- Verification status: blue checkmark indicator
- Post feed: media items with like counts, comment counts, captions, timestamps
- Reels: short video content metadata
- Tagged posts: content where the user is tagged by others
- Highlights: story highlight collections
Without authentication, you can get basic bio/stats only. The private mobile API gives everything except private account content.
Approach 1: Public og:meta Scraping (No Auth)
Instagram's profile pages include OpenGraph meta tags for search engine indexing. These tags contain a condensed version of the profile bio and stats in the og:description field.
import requests
from html.parser import HTMLParser
import re
class OGParser(HTMLParser):
    """Minimal HTML parser that collects OpenGraph ``<meta>`` tags.

    After feeding a document, ``og_data`` maps each ``og:*`` property
    name to the value of its ``content`` attribute.
    """

    def __init__(self):
        super().__init__()
        # property name -> content value, e.g. 'og:title' -> '...'
        self.og_data = {}

    def handle_starttag(self, tag, attrs):
        if tag != 'meta':
            return
        attr_map = dict(attrs)
        property_name = attr_map.get('property', '')
        if property_name.startswith('og:'):
            self.og_data[property_name] = attr_map.get('content', '')
def scrape_public_profile(username: str, proxy: dict = None) -> dict:
    """Fetch a public Instagram profile's og:meta data (no auth required).

    Parses follower/following/post counts out of the og:description tag.
    Handles both exact counts ("12,345 Followers") and the abbreviated
    forms Instagram uses for large accounts ("1.5M Followers", "281K"),
    which the previous digits-only pattern failed to match.

    Args:
        username: Instagram handle without the leading '@'.
        proxy: optional requests-style proxies dict.

    Returns:
        dict with username, title, description, profile_pic_url,
        followers/following/post_count (ints, or None when the
        description could not be parsed), and profile_url.

    Raises:
        requests.HTTPError: on non-2xx responses.
    """
    url = f'https://www.instagram.com/{username}/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 Chrome/131.0.0.0 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.9',
    }
    resp = requests.get(url, headers=headers, proxies=proxy, timeout=15)
    resp.raise_for_status()

    parser = OGParser()
    parser.feed(resp.text)
    og = parser.og_data

    # og:description format:
    # 'X Followers, Y Following, Z Posts - See Instagram photos and videos from ...'
    # where X/Y/Z may be exact ('12,345') or abbreviated ('1.5M', '281K').
    desc = og.get('og:description', '')

    def parse_num(token: str) -> int:
        """Convert '12,345', '1.5M', '281K', '2B' etc. to an int."""
        token = token.strip()
        suffix = token[-1].upper() if token else ''
        scales = {'K': 1_000, 'M': 1_000_000, 'B': 1_000_000_000}
        if suffix in scales:
            # Abbreviated form: the dot is a decimal point ('1.5M').
            return int(float(token[:-1].replace(',', '')) * scales[suffix])
        # Plain number: ',' and '.' are thousands separators.
        return int(token.replace(',', '').replace('.', ''))

    followers = following = posts = None
    m = re.search(
        r'([\d,.]+[KMB]?)\s*Followers,\s*([\d,.]+[KMB]?)\s*Following,\s*([\d,.]+[KMB]?)\s*Posts',
        desc,
        re.IGNORECASE,
    )
    if m:
        followers = parse_num(m.group(1))
        following = parse_num(m.group(2))
        posts = parse_num(m.group(3))

    return {
        'username': username,
        'title': og.get('og:title', ''),
        'description': desc,
        'profile_pic_url': og.get('og:image', ''),
        'followers': followers,
        'following': following,
        'post_count': posts,
        'profile_url': og.get('og:url', f'https://www.instagram.com/{username}/'),
    }
# Works without authentication — but limited to what og:meta exposes
profile = scrape_public_profile('natgeo')
# followers may be None when Instagram serves a login page or the
# og:description format changes — formatting None with ':,' raises TypeError.
if profile['followers'] is not None:
    print(f"@{profile['username']}: {profile['followers']:,} followers")
else:
    print(f"@{profile['username']}: follower count unavailable")
Limitations: After 20-30 requests from the same IP, Instagram starts returning login redirect pages — per-IP request volume is the first thing Instagram throttles. Residential proxies from ThorData extend the viable volume significantly by rotating IPs automatically.
Approach 2: Private Mobile API (Full Data)
The Instagram mobile app's private API returns rich JSON with complete profile data. You need a valid session cookie from a logged-in account.
import requests
import time
import random
# Credentials/identifiers for the private mobile API.
SESSION_ID = 'your-session-id-from-browser'  # value of the 'sessionid' cookie from a logged-in session
APP_ID = '936619743392459'  # Instagram's public app ID, sent as X-IG-App-ID
# Headers mimicking the official Android app client.
HEADERS = {
    'User-Agent': 'Instagram 317.0.0.34.109 Android (30/11; 420dpi; 1080x2220; samsung; SM-G991B; o1s; exynos2100)',
    'X-IG-App-ID': APP_ID,
    'Accept': 'application/json',
    'Accept-Language': 'en-US',
}
# Authentication cookie attached to every private-API request.
COOKIES = {'sessionid': SESSION_ID}
def get_profile_by_username(username: str) -> dict:
    """Fetch a profile via the web_profile_info endpoint; returns the raw JSON."""
    endpoint = 'https://i.instagram.com/api/v1/users/web_profile_info/'
    response = requests.get(
        endpoint,
        params={'username': username},
        headers=HEADERS,
        cookies=COOKIES,
        timeout=15,
    )
    response.raise_for_status()
    return response.json()
def get_profile_by_id(user_id: str) -> dict:
    """Fetch a profile by numeric user ID via users/{id}/info; returns the raw JSON."""
    endpoint = f'https://i.instagram.com/api/v1/users/{user_id}/info/'
    response = requests.get(endpoint, headers=HEADERS, cookies=COOKIES, timeout=15)
    response.raise_for_status()
    return response.json()
def extract_profile_data(api_response: dict) -> dict:
    """Normalize a raw profile API response into a flat dict.

    Accepts both response shapes seen in this file: web_profile_info
    nests the user under data.user, while users/{id}/info returns a
    top-level 'user' key.
    """
    user = api_response.get('data', {}).get('user', {}) or api_response.get('user', {})

    def edge_count(edge_name: str) -> int:
        # Counts live under {'<edge>': {'count': N}}; default 0 when absent.
        return user.get(edge_name, {}).get('count', 0)

    return {
        'id': user.get('id'),
        'username': user.get('username'),
        'full_name': user.get('full_name'),
        'biography': user.get('biography'),
        'website': user.get('external_url'),
        'profile_pic_url': user.get('profile_pic_url_hd') or user.get('profile_pic_url'),
        'followers': edge_count('edge_followed_by'),
        'following': edge_count('edge_follow'),
        'post_count': edge_count('edge_owner_to_timeline_media'),
        'is_verified': user.get('is_verified', False),
        'is_private': user.get('is_private', False),
        'is_business': user.get('is_business_account', False),
        'business_category': user.get('business_category_name'),
        'is_professional': user.get('is_professional_account', False),
    }
# Fetch and parse a profile
raw = get_profile_by_username('natgeo')
profile = extract_profile_data(raw)
print(f"@{profile['username']} ({profile['full_name']})")
print(f" Followers: {profile['followers']:,}")
print(f" Following: {profile['following']:,}")
print(f" Posts: {profile['post_count']:,}")
print(f" Verified: {profile['is_verified']}")
# biography is None for accounts with no bio — slicing None raises TypeError
print(f" Bio: {(profile['biography'] or '')[:100]}")
Fetching Post Feed
Once you have a user ID, you can fetch their post history with pagination:
def get_user_posts(user_id: str, max_pages: int = 5) -> list[dict]:
    """Page through a user's feed and return normalized post dicts.

    Pagination is cursor-based via next_max_id. On HTTP 429 the loop
    waits 60s and continues; NOTE(review): the page counter still
    advances on that path, so a rate-limited run may fetch fewer than
    max_pages pages of data — confirm this is acceptable.
    """
    posts = []
    url = f'https://i.instagram.com/api/v1/feed/user/{user_id}/'
    max_id = None  # pagination cursor; omitted on the first request
    for page_num in range(max_pages):
        params = {}
        if max_id:
            params['max_id'] = max_id
        resp = requests.get(url, params=params, headers=HEADERS, cookies=COOKIES, timeout=15)
        if resp.status_code == 429:
            print(f' Rate limited on page {page_num}, waiting 60s')
            time.sleep(60)
            continue
        resp.raise_for_status()
        data = resp.json()
        for item in data.get('items', []):
            # caption can be null in the API response
            caption_obj = item.get('caption') or {}
            # Get image URL
            image_url = None
            if item.get('image_versions2'):
                candidates = item['image_versions2'].get('candidates', [])
                if candidates:
                    # first candidate is used; presumably the largest rendition — TODO confirm
                    image_url = candidates[0]['url']
            media_type = item.get('media_type', 1)
            # numeric media_type codes observed from the API
            type_map = {1: 'photo', 2: 'video', 8: 'carousel'}
            posts.append({
                'id': item.get('pk'),
                'shortcode': item.get('code'),
                'media_type': type_map.get(media_type, 'photo'),
                'like_count': item.get('like_count', 0),
                'comment_count': item.get('comment_count', 0),
                'view_count': item.get('view_count'),
                'taken_at': item.get('taken_at'),
                'caption': caption_obj.get('text', ''),
                'image_url': image_url,
                'post_url': f"https://www.instagram.com/p/{item.get('code')}/" if item.get('code') else None,
                'tagged_users': [
                    t.get('user', {}).get('username')
                    for t in item.get('usertags', {}).get('in', [])
                ],
                'location': item.get('location', {}).get('name') if item.get('location') else None,
            })
        if not data.get('more_available'):
            break
        max_id = data.get('next_max_id')
        # jittered delay between pages to avoid a fixed-interval bot signal
        time.sleep(random.uniform(2, 4))
    return posts
# Fetch posts for a user
raw = get_profile_by_username('natgeo')
profile = extract_profile_data(raw)
posts = get_user_posts(profile['id'], max_pages=3)
print(f'Fetched {len(posts)} posts')
for p in posts[:5]:
    # taken_at is printed as the raw unix timestamp from the API
    print(f" {p['taken_at']}: {p['like_count']:,} likes, {p['comment_count']:,} comments ({p['media_type']})")
Fetching Reels Data
def get_user_reels(user_id: str, max_pages: int = 3) -> list[dict]:
    """Page through a user's Reels via the clips endpoint.

    Unlike the feed endpoint, clips expects a POST with form data and
    paginates via paging_info.max_id. Any non-200 response ends the
    loop early (no retry).
    """
    url = f'https://i.instagram.com/api/v1/clips/user/'
    reels = []
    max_id = None  # pagination cursor from paging_info.max_id
    for _ in range(max_pages):
        payload = {
            'target_user_id': user_id,
            'page_size': 12,
        }
        if max_id:
            payload['max_id'] = max_id
        resp = requests.post(url, data=payload, headers=HEADERS, cookies=COOKIES, timeout=15)
        if resp.status_code != 200:
            break
        data = resp.json()
        for item in data.get('items', []):
            # each item wraps the actual post object under a 'media' key
            media = item.get('media', {})
            reels.append({
                'id': media.get('pk'),
                'shortcode': media.get('code'),
                'like_count': media.get('like_count', 0),
                'comment_count': media.get('comment_count', 0),
                'play_count': media.get('play_count', 0),
                'taken_at': media.get('taken_at'),
                'duration': media.get('video_duration'),
                'caption': (media.get('caption') or {}).get('text', ''),
                'post_url': f"https://www.instagram.com/reel/{media.get('code')}/",
            })
        if not data.get('paging_info', {}).get('more_available'):
            break
        max_id = data.get('paging_info', {}).get('max_id')
        # jittered delay between pages
        time.sleep(random.uniform(2, 4))
    return reels
Bulk Profile Collection
For research requiring many profiles, use a pipeline with session rotation and proxy support:
import json
from pathlib import Path
from datetime import datetime
# Multiple sessions to distribute load
SESSION_POOL = [
'session_id_1',
'session_id_2',
'session_id_3',
]
# ThorData residential proxy — https://thordata.partnerstack.com/partner/0a0x4nzb (or [Oxylabs](https://oxylabs.go2cloud.org/aff_c?offer_id=7&aff_id=2066&url_id=174))
PROXY_URL = 'http://USER:[email protected]:9000'
def make_session(session_id: str) -> requests.Session:
    """Build a requests.Session pre-configured with the mobile-app
    headers, the given session cookie, and the residential proxy."""
    sess = requests.Session()
    sess.headers.update(HEADERS)
    sess.cookies.set('sessionid', session_id, domain='.instagram.com')
    sess.proxies = {'http': PROXY_URL, 'https': PROXY_URL}
    return sess
def collect_profiles_bulk(
    usernames: list[str],
    output_dir: str = 'profiles',
    include_posts: bool = True,
    posts_per_profile: int = 20,
) -> list[dict]:
    """Collect many profiles with session rotation, saving each to JSON.

    Fixes over the naive version: rate-limited usernames are retried once
    and then recorded as error entries instead of being silently dropped,
    and session rotation starts at SESSION_POOL[0] instead of skipping it
    on the first cycle.

    Args:
        usernames: Instagram handles to fetch.
        output_dir: directory for per-profile JSON files and the combined dump.
        include_posts: also fetch recent posts for public accounts.
        posts_per_profile: approximate number of posts to fetch per profile.

    Returns:
        List of profile dicts; failures appear as
        {'username': ..., 'error': ...} entries.
    """
    out = Path(output_dir)
    out.mkdir(exist_ok=True)
    results = []
    session = None
    for i, username in enumerate(usernames):
        # Rotate session every 10 requests; i // 10 walks the pool from
        # index 0 so every session (including the first) gets used.
        if session is None or i % 10 == 0:
            session = make_session(SESSION_POOL[(i // 10) % len(SESSION_POOL)])
        print(f'Fetching @{username} ({i+1}/{len(usernames)})')
        try:
            url = 'https://i.instagram.com/api/v1/users/web_profile_info/'
            resp = session.get(url, params={'username': username}, timeout=15)
            if resp.status_code == 429:
                # Back off once and retry the same username rather than
                # silently losing it from the results.
                print(' Rate limited — waiting 90s')
                time.sleep(90)
                resp = session.get(url, params={'username': username}, timeout=15)
                if resp.status_code == 429:
                    results.append({'username': username, 'error': 'rate_limited'})
                    continue
            resp.raise_for_status()
            profile_data = extract_profile_data(resp.json())
            if include_posts and not profile_data.get('is_private'):
                user_id = profile_data.get('id')
                if user_id:
                    # Use a smaller page count for bulk collection (~12 posts/page)
                    post_pages = max(1, posts_per_profile // 12)
                    posts = get_user_posts(user_id, max_pages=post_pages)
                    profile_data['posts'] = posts
                    time.sleep(random.uniform(1.5, 3))
            results.append(profile_data)
            # Save each profile as it completes so a crash mid-run
            # doesn't lose earlier work.
            profile_file = out / f'{username}.json'
            profile_file.write_text(json.dumps(profile_data, indent=2, ensure_ascii=False))
        except requests.exceptions.HTTPError as e:
            print(f' Error for @{username}: {e}')
            results.append({'username': username, 'error': str(e)})
        time.sleep(random.uniform(2, 5))
    # Save combined results
    timestamp = datetime.now().strftime('%Y%m%d_%H%M')
    combined = out / f'all_profiles_{timestamp}.json'
    combined.write_text(json.dumps(results, indent=2, ensure_ascii=False))
    print(f'Saved {len(results)} profiles to {combined}')
    return results
Engagement Rate Calculation
Engagement rate is the key metric for influencer analysis:
import statistics
def calculate_engagement_rate(profile: dict, posts: list[dict]) -> dict:
    """Compute engagement rate over the 20 most recent posts.

    engagement_rate = avg(likes + comments) / followers * 100.
    Returns {'engagement_rate': None} when followers or posts are missing.
    """
    followers = profile.get('followers', 0)
    if not followers or not posts:
        return {'engagement_rate': None}

    # Most recent first, capped at 20 posts.
    recent = sorted(posts, key=lambda p: p.get('taken_at', 0), reverse=True)[:20]
    likes = [p.get('like_count', 0) for p in recent]
    comments = [p.get('comment_count', 0) for p in recent]
    interactions = [lk + cm for lk, cm in zip(likes, comments)]
    avg_interactions = statistics.mean(interactions) if interactions else 0

    return {
        'engagement_rate': round(avg_interactions / followers * 100, 4),
        'avg_likes': statistics.mean(likes),
        'avg_comments': statistics.mean(comments),
        'avg_interactions': avg_interactions,
        'posts_analyzed': len(recent),
    }
# Industry benchmarks:
# < 1% = Low engagement
# 1-3% = Average
# 3-6% = High engagement
# > 6% = Very high (common in micro-influencers)
CDN URL Expiry
Instagram image and video URLs expire within 24-48 hours. The expiry timestamp is embedded in the URL parameters. Always download media immediately — do not store CDN URLs and expect them to work later:
import httpx
from pathlib import Path
import re
from urllib.parse import urlparse, parse_qs
def get_url_expiry_timestamp(cdn_url: str) -> int | None:
# Some URLs have expiry in hex (oe= param), others in path
parsed = urlparse(cdn_url)
params = parse_qs(parsed.query)
if 'oe' in params:
return int(params['oe'][0], 16) # hex timestamp
match = re.search(r'[/_]e(\d{10})', cdn_url)
if match:
return int(match.group(1))
return None
def download_media_batch(posts: list[dict], output_dir: str = 'media') -> None:
    """Download each post's image immediately (CDN URLs expire in 24-48h).

    Saves to <output_dir>/<post_id>.jpg. Failures are logged and skipped
    so one bad URL doesn't abort the whole batch.
    NOTE(review): relies on `time` being imported earlier in the file —
    it is not re-imported in this snippet's import block.
    """
    out = Path(output_dir)
    out.mkdir(exist_ok=True)
    with httpx.Client(timeout=30) as client:
        for post in posts:
            post_id = post.get('id', 'unknown')
            img_url = post.get('image_url')
            if img_url:
                try:
                    r = client.get(img_url)
                    r.raise_for_status()
                    (out / f'{post_id}.jpg').write_bytes(r.content)
                except Exception as e:
                    # best-effort: log and move on to the next post
                    print(f' Failed to download {post_id}: {e}')
            time.sleep(0.5)  # gentle on the CDN
Anti-Detection Best Practices
Instagram's detection systems look for these bot signals:
| Signal | Risk | Mitigation |
|---|---|---|
| Datacenter IP | Critical | Use residential proxies |
| Fixed request intervals | High | Randomize delays 1.5-5s |
| High volume per session | High | Rotate session cookies |
| Same IP + multiple accounts | High | Separate IPs per session |
| Python default user agent | Medium | Use mobile app user agent |
| Sequential username scraping | Medium | Randomize order |
For residential proxy rotation, ThorData provides pools of real residential IPs that distribute requests across different households and ISPs. Each request appears to come from a different user, which is the fundamental requirement for bypassing Instagram's per-IP rate limits.
# Proxy-aware request wrapper with retry logic
import time, random, requests
from typing import Optional
# Proxies dict applied to every request; both schemes route through the
# same residential gateway.
PROXY = {'http': 'http://USER:[email protected]:9000',
         'https': 'http://USER:[email protected]:9000'}
def instagram_get(url: str, params: dict = None, max_retries: int = 3) -> Optional[dict]:
    """GET an Instagram API URL through the proxy with retry and backoff.

    Returns the parsed JSON body, or None on session expiry (401), any
    non-retryable failure, or when all retries are exhausted.
    """
    for attempt in range(max_retries):
        try:
            resp = requests.get(
                url,
                params=params,
                headers=HEADERS,
                cookies=COOKIES,
                proxies=PROXY,
                timeout=15,
            )
            if resp.status_code == 429:
                wait = (2 ** attempt) * 30  # exponential backoff: 30s, 60s, 120s
                print(f' Rate limited — waiting {wait}s')
                time.sleep(wait)
                continue
            if resp.status_code == 401:
                # Session cookie no longer valid — retrying won't help.
                print(' Session expired — need new session cookie')
                return None
            resp.raise_for_status()
            return resp.json()
        except requests.exceptions.ConnectionError:
            # Transient network/proxy failure: linear backoff, then retry.
            print(f' Connection error on attempt {attempt+1}')
            time.sleep(5 * (attempt + 1))
        except Exception as e:
            # Any other failure (HTTP error status, bad JSON) is treated as fatal.
            print(f' Request failed: {e}')
            return None
    return None
Legal Considerations
Using Instagram's private API violates Meta's Terms of Service. Meta has sued and settled with several scraping operations. Courts have generally ruled that scraping publicly accessible data is not a CFAA violation (hiQ v. LinkedIn), but Meta's position is more aggressive, especially for data accessed via circumvented authentication.
The safest approach: use the og:meta method for basic profile stats (bio, follower counts) — this is genuinely public data. Use the private API sparingly for research purposes and keep volume low. Never sell scraped Instagram data or build a product that depends on sustained large-scale scraping.
Summary
Instagram profile scraping in 2026 requires either the limited og:meta approach (no auth, basic stats) or the private mobile API (session required, full data). The mobile API returns comprehensive profile data — followers, following, post counts, verification status, business type, and the full post feed with engagement metrics.
Rate limits are around 100-200 requests per day per session. Rotate session cookies across multiple accounts, use ThorData's residential proxies to distribute IP-level load, randomize delays between requests, and download media immediately since CDN URLs expire within 24-48 hours.
Influencer Discovery and Scoring
The most common commercial use case for profile scraping is building influencer databases. Here is a scoring system that ranks profiles by likely brand-partnership value:
import statistics
from typing import Optional
def score_influencer(
    profile: dict,
    posts: list[dict],
    weight_reach: float = 0.3,
    weight_engagement: float = 0.5,
    weight_consistency: float = 0.2,
) -> dict:
    """Score a profile 0-100 for likely brand-partnership value.

    Blends three normalized components: reach (log-scale follower count,
    10M == 100), engagement rate (benchmarked per follower tier), and
    posting consistency (posts/week versus an ideal of 5).
    """
    import math

    followers = profile.get("followers", 0)
    if followers < 1000:
        # Below the nano-influencer floor — not worth scoring.
        return {"score": 0, "tier": "nano", "reason": "Too small"}

    # --- Engagement rate over the 20 most recent posts ---
    engagement_rate = 0.0
    if posts:
        recent = sorted(posts, key=lambda p: p.get("taken_at", 0), reverse=True)[:20]
        totals = [p.get("like_count", 0) + p.get("comment_count", 0) for p in recent]
        mean_interactions = statistics.mean(totals) if totals else 0
        engagement_rate = (mean_interactions / followers) * 100

    # --- Posting consistency: posts/week over the observed span ---
    posting_frequency = 0
    if len(posts) >= 2:
        chronological = sorted(posts, key=lambda p: p.get("taken_at", 0))
        oldest_ts = chronological[0].get("taken_at", 0)
        newest_ts = chronological[-1].get("taken_at", 0)
        if oldest_ts and newest_ts and newest_ts > oldest_ts:
            weeks_span = (newest_ts - oldest_ts) / (7 * 24 * 3600)
            posting_frequency = len(posts) / weeks_span if weeks_span > 0 else 0

    # --- Normalize each component to 0-100 ---
    # Reach: log scale capped at 10M followers.
    reach_score = min(100, math.log10(max(followers, 1)) / math.log10(10_000_000) * 100)

    # Engagement: the "excellent" benchmark shrinks as the audience grows.
    if followers < 10_000:
        eng_benchmark = 5.0  # nano: 5%+ is excellent
    elif followers < 100_000:
        eng_benchmark = 3.0  # micro: 3%+ is excellent
    elif followers < 1_000_000:
        eng_benchmark = 2.0  # mid: 2%+ is excellent
    else:
        eng_benchmark = 1.0  # mega: 1%+ is excellent
    eng_score = min(100, (engagement_rate / eng_benchmark) * 100)

    # Consistency: 5 posts/week is treated as the ideal cadence.
    ideal_frequency = 5.0
    freq_score = min(100, (posting_frequency / ideal_frequency) * 100) if posting_frequency > 0 else 0

    overall_score = (
        weight_reach * reach_score
        + weight_engagement * eng_score
        + weight_consistency * freq_score
    )

    # Follower-count tier label.
    if followers < 10_000:
        tier = "nano"
    elif followers < 100_000:
        tier = "micro"
    elif followers < 500_000:
        tier = "mid"
    elif followers < 1_000_000:
        tier = "macro"
    else:
        tier = "mega"

    return {
        "score": round(overall_score, 1),
        "tier": tier,
        "followers": followers,
        "engagement_rate": round(engagement_rate, 3),
        "posting_frequency_weekly": round(posting_frequency, 1),
        "reach_score": round(reach_score, 1),
        "engagement_score": round(eng_score, 1),
        "consistency_score": round(freq_score, 1),
    }
Niche Detection from Bio and Caption Analysis
Automatically categorize profiles into niches using text analysis:
import re
from collections import Counter
# Keyword regexes (matched case-insensitively in detect_niche) mapping
# each niche label to vocabulary commonly seen in that niche's bios and captions.
NICHE_PATTERNS = {
    "fitness": r"\b(fitness|gym|workout|training|crossfit|yoga|nutrition|bodybuilding|weightlifting|hiit)\b",
    "travel": r"\b(travel|wanderlust|adventure|explore|backpacking|nomad|destination|itinerary|vacation)\b",
    "food": r"\b(food|recipe|cooking|chef|restaurant|foodie|gastronomy|cuisine|meal|delicious)\b",
    "tech": r"\b(tech|coding|programming|developer|software|startup|saas|ai|machine learning|data science)\b",
    "fashion": r"\b(fashion|style|outfit|ootd|streetwear|luxury|designer|brand|wear|clothing)\b",
    "beauty": r"\b(beauty|makeup|skincare|cosmetics|haircare|grooming|glam|tutorial|swatch)\b",
    "gaming": r"\b(gaming|gamer|esports|twitch|streamer|playstation|xbox|nintendo|fps|rpg)\b",
    "business": r"\b(entrepreneur|business|ceo|founder|startup|investment|finance|money|wealth)\b",
    "photography": r"\b(photography|photographer|photo|camera|lightroom|portrait|landscape|shoot)\b",
    "music": r"\b(music|musician|singer|producer|dj|band|album|concert|studio|lyrics)\b",
}
def detect_niche(profile: dict, posts: list[dict] = None) -> dict:
    """Classify a profile into a content niche from bio + recent captions.

    Returns the best-matching niche, a 0-100 confidence (the winner's
    share of all keyword hits), and the per-niche hit counts.
    """
    bio = (profile.get("biography") or "").lower()
    captions = " ".join(p.get("caption", "") for p in (posts or [])[:20]).lower()
    text = bio + " " + captions

    hits = {
        niche: len(re.findall(pattern, text, re.IGNORECASE))
        for niche, pattern in NICHE_PATTERNS.items()
    }

    if not any(hits.values()):
        return {"primary_niche": "general", "confidence": 0, "all_scores": hits}

    primary = max(hits, key=hits.get)
    total = sum(hits.values())
    confidence = (hits[primary] / total * 100) if total > 0 else 0
    return {
        "primary_niche": primary,
        "confidence": round(confidence, 1),
        "all_scores": hits,
    }
Profile Comparison and Benchmarking
Compare a set of profiles against each other to identify top performers in a niche:
import json
from pathlib import Path
from datetime import datetime
def benchmark_profiles(profiles_dir: str) -> list[dict]:
    """Load saved profile JSONs, score and niche-classify each, and print
    a ranked leaderboard of the top 20.

    Returns every loaded profile, sorted by score descending.
    """
    loaded = []
    for path in Path(profiles_dir).glob("*.json"):
        try:
            record = json.loads(path.read_text())
        except Exception:
            continue  # skip unreadable/corrupt files
        if "username" in record:
            loaded.append(record)

    if not loaded:
        return []

    # Attach score and niche metadata to each profile.
    scored_profiles = []
    for record in loaded:
        posts = record.get("posts", [])
        scoring = score_influencer(record, posts)
        niche = detect_niche(record, posts)
        scored_profiles.append({
            **record,
            "score": scoring.get("score", 0),
            "tier": scoring.get("tier"),
            "engagement_rate": scoring.get("engagement_rate"),
            "primary_niche": niche.get("primary_niche"),
        })

    scored_profiles.sort(key=lambda entry: entry.get("score", 0), reverse=True)

    print(f"\nProfile Benchmark ({len(scored_profiles)} profiles):")
    print(f"{'Username':<25} {'Followers':>10} {'Eng Rate':>9} {'Score':>7} {'Tier':<8} {'Niche'}")
    print("-" * 80)
    for entry in scored_profiles[:20]:
        followers = entry.get("followers", 0)
        eng = entry.get("engagement_rate") or 0
        score = entry.get("score", 0)
        print(
            f"{entry.get('username', 'N/A'):<25} {followers:>10,} "
            f"{eng:>8.2f}% {score:>7.1f} {entry.get('tier', 'N/A'):<8} {entry.get('primary_niche', 'N/A')}"
        )
    return scored_profiles
Follower Growth Tracking
To track follower growth over time, take periodic snapshots:
import json
import time
import random
import requests
from pathlib import Path
from datetime import datetime
GROWTH_DB = Path("instagram_growth_tracker.json")
def load_growth_data() -> dict:
    """Read the snapshot DB from disk, or start fresh with an empty dict."""
    if not GROWTH_DB.exists():
        return {}
    return json.loads(GROWTH_DB.read_text())
def save_growth_data(data: dict) -> None:
    """Persist the snapshot DB as pretty-printed JSON."""
    serialized = json.dumps(data, indent=2)
    GROWTH_DB.write_text(serialized)
def snapshot_profiles(
    usernames: list[str],
    session_id: str,
    proxy_url: str = None,
) -> None:
    """Record a follower/post-count snapshot for each username.

    Appends one timestamped entry per username to the growth DB and
    persists everything at the end of the run.
    NOTE(review): on HTTP 429 the username is skipped for this run after
    a 60s wait (no retry) — confirm that gap is acceptable for trend data.
    """
    growth_data = load_growth_data()
    # One shared timestamp for the whole run, so entries group cleanly.
    now = datetime.now().isoformat()
    headers = {
        "User-Agent": "Instagram 317.0.0.34.109 Android (30/11; 420dpi; 1080x2220; samsung; SM-G991B; o1s; exynos2100)",
        "X-IG-App-ID": "936619743392459",
    }
    cookies = {"sessionid": session_id}
    proxies = {"http": proxy_url, "https": proxy_url} if proxy_url else None
    for username in usernames:
        resp = requests.get(
            "https://i.instagram.com/api/v1/users/web_profile_info/",
            params={"username": username},
            headers=headers,
            cookies=cookies,
            proxies=proxies,
            timeout=15,
        )
        if resp.status_code == 429:
            print(f" Rate limited — waiting 60s")
            time.sleep(60)
            continue
        if resp.status_code != 200:
            print(f" Failed @{username}: {resp.status_code}")
            continue
        user = resp.json().get("data", {}).get("user", {})
        followers = user.get("edge_followed_by", {}).get("count", 0)
        posts = user.get("edge_owner_to_timeline_media", {}).get("count", 0)
        if username not in growth_data:
            growth_data[username] = []
        growth_data[username].append({
            "timestamp": now,
            "followers": followers,
            "post_count": posts,
        })
        print(f" @{username}: {followers:,} followers")
        # jittered delay between profile fetches
        time.sleep(random.uniform(2, 4))
    # Persist all snapshots in one write at the end of the run.
    save_growth_data(growth_data)
def calculate_follower_growth(username: str, days: int = 30) -> dict | None:
    """Summarize follower growth for a username over a trailing window.

    Uses the snapshot history recorded by snapshot_profiles(). Returns
    None when fewer than two snapshots exist; falls back to the last two
    snapshots when the window holds fewer than two.
    """
    from datetime import timedelta

    snapshots = load_growth_data().get(username, [])
    if len(snapshots) < 2:
        return None

    ordered = sorted(snapshots, key=lambda snap: snap["timestamp"])
    # ISO-8601 strings sort chronologically, so plain string comparison works.
    cutoff = (datetime.now() - timedelta(days=days)).isoformat()
    window = [snap for snap in ordered if snap["timestamp"] >= cutoff]
    if len(window) < 2:
        window = ordered[-2:]

    first, last = window[0], window[-1]
    gained = last["followers"] - first["followers"]
    elapsed_days = (
        datetime.fromisoformat(last["timestamp"]) - datetime.fromisoformat(first["timestamp"])
    ).total_seconds() / 86400

    return {
        "username": username,
        "current_followers": last["followers"],
        "follower_gain": gained,
        "daily_growth_avg": round(gained / elapsed_days, 1) if elapsed_days > 0 else 0,
        "growth_rate_pct": round(gained / first["followers"] * 100, 3) if first["followers"] > 0 else 0,
        "days_measured": round(elapsed_days, 1),
    }
Integration with ThorData for Scale
When building a profile monitoring system that tracks hundreds of accounts, the volume quickly exceeds what a single session and IP can handle. Here is a production-ready integration with ThorData's residential proxy pool:
import requests
import random
import time
from typing import Optional
# ThorData residential proxy — https://thordata.partnerstack.com/partner/0a0x4nzb (or [Oxylabs](https://oxylabs.go2cloud.org/aff_c?offer_id=7&aff_id=2066&url_id=174))
THORDATA_USER = "your_user"
THORDATA_PASS = "your_pass"
THORDATA_HOST = "proxy.thordata.net"
THORDATA_PORT = 9000
def get_proxy_url(country_code: str = "us") -> str:
    """Build a ThorData proxy URL geo-targeted via the -cc-<country> username suffix."""
    credentials = f"{THORDATA_USER}:{THORDATA_PASS}-cc-{country_code}"
    return f"http://{credentials}@{THORDATA_HOST}:{THORDATA_PORT}"
class InstagramProfileCollector:
def __init__(self, sessions: list[str], use_proxies: bool = True):
self.sessions = sessions
self.use_proxies = use_proxies
self.session_idx = 0
self.request_counts = {s: 0 for s in sessions}
self.countries = ["us", "gb", "ca", "au", "de"]
def _get_next_session(self) -> str:
# Find least-used available session
available = [
s for s in self.sessions
if self.request_counts.get(s, 0) < 80
]
if not available:
raise RuntimeError("All sessions at capacity")
return min(available, key=lambda s: self.request_counts.get(s, 0))
def _get_proxy(self) -> dict | None:
if not self.use_proxies:
return None
country = random.choice(self.countries)
proxy_url = get_proxy_url(country)
return {"http": proxy_url, "https": proxy_url}
def fetch_profile(self, username: str) -> Optional[dict]:
session_id = self._get_next_session()
proxies = self._get_proxy()
headers = {
"User-Agent": "Instagram 317.0.0.34.109 Android (30/11; 420dpi; 1080x2220; samsung; SM-G991B; o1s; exynos2100)",
"X-IG-App-ID": "936619743392459",
}
cookies = {"sessionid": session_id}
try:
resp = requests.get(
"https://i.instagram.com/api/v1/users/web_profile_info/",
params={"username": username},
headers=headers,
cookies=cookies,
proxies=proxies,
timeout=15,
)
if resp.status_code == 429:
self.request_counts[session_id] = 999 # mark as exhausted
return None
resp.raise_for_status()
self.request_counts[session_id] = self.request_counts.get(session_id, 0) + 1
return resp.json()
except Exception as e:
print(f" Error fetching @{username}: {e}")
return None
def collect_profiles(
self,
usernames: list[str],
delay_range: tuple = (2.0, 5.0),
) -> list[dict]:
results = []
for i, username in enumerate(usernames):
print(f"Fetching @{username} ({i+1}/{len(usernames)})")
raw = self.fetch_profile(username)
if raw:
profile = extract_profile_data(raw)
results.append(profile)
print(f" {profile.get('followers', 0):,} followers")
time.sleep(random.uniform(*delay_range))
return results
Output Formats and Integration
Profile data integrates well with business intelligence tools, CRM systems, and spreadsheets:
import csv
import json
from pathlib import Path
from datetime import datetime
def export_profiles(
    profiles: list[dict],
    output_dir: str = "instagram_exports",
    include_posts: bool = False,
) -> dict:
    """Write profiles to timestamped JSON (full data) and CSV (flat summary).

    Returns a dict with the written file paths and the profile count.
    """
    out = Path(output_dir)
    out.mkdir(exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M")

    # JSON keeps the complete nested structure.
    json_file = out / f"profiles_{timestamp}.json"
    json_file.write_text(json.dumps(profiles, indent=2, ensure_ascii=False))

    # CSV carries only the flat summary columns; nested post data is dropped.
    csv_fields = [
        "username", "full_name", "biography", "website",
        "followers", "following", "post_count",
        "is_verified", "is_private", "is_business",
        "business_category", "profile_pic_url",
    ]
    csv_file = out / f"profiles_{timestamp}.csv"
    with open(csv_file, "w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=csv_fields, extrasaction="ignore")
        writer.writeheader()
        writer.writerows(
            {field: profile.get(field) for field in csv_fields}
            for profile in profiles
        )

    print(f"Exported {len(profiles)} profiles:")
    print(f" JSON: {json_file}")
    print(f" CSV: {csv_file}")
    return {"json": str(json_file), "csv": str(csv_file), "count": len(profiles)}
Audience Demographics Inference
While Instagram's API does not expose demographic data for arbitrary profiles, you can infer demographics from the content and engagement patterns:
import re
from collections import Counter
def infer_audience_demographics(posts: list[dict], profile: dict) -> dict:
    """Heuristically infer audience traits from bio + caption keywords.

    Instagram's API exposes no demographics for arbitrary accounts, so
    this counts signal words as a rough proxy. Treat the results as
    hypotheses, not ground truth.
    """
    bio = (profile.get("biography") or "").lower()
    captions = " ".join(p.get("caption", "") for p in posts[:30]).lower()
    combined = bio + " " + captions

    def count_matches(signal_map: dict) -> dict:
        return {label: len(re.findall(pattern, combined, re.IGNORECASE))
                for label, pattern in signal_map.items()}

    # Age indicator keywords
    age_scores = count_matches({
        "gen_z": r"\b(gen z|genz|zoomer|tiktok|discord|stan|slay|vibe|aesthetic|no cap|fr fr)\b",
        "millennial": r"\b(millennial|adulting|throwback|nostalgia|90s|2000s|hustle culture|wine mom)\b",
        "professional": r"\b(ceo|founder|executive|linkedin|b2b|enterprise|corporate|professional)\b",
        "parent": r"\b(mom|dad|parent|toddler|kids|parenting|family|babywearing|breastfeeding)\b",
    })

    # Gender indicator (rough)
    gender_scores = count_matches({
        "female": r"\b(she/her|woman|girl|sis|ladies|feminist|girly|sisterhood|babe)\b",
        "male": r"\b(he/him|man|guy|bro|brotherhood|masculine|gentlemen)\b",
    })

    # Interest inference
    interest_scores = count_matches({
        "sports": r"\b(athlete|fitness|gym|sport|training|marathon|crossfit|swim|run|cycle)\b",
        "lifestyle": r"\b(lifestyle|wellness|mindset|self.care|routine|morning|journal)\b",
        "creative": r"\b(art|design|creative|photography|illustration|craft|sketch|drawing)\b",
        "entrepreneur": r"\b(entrepreneur|startup|founder|business|passive income|side hustle)\b",
    })

    return {
        "likely_age_group": max(age_scores, key=age_scores.get) if any(age_scores.values()) else "unknown",
        "age_signals": age_scores,
        "likely_gender": max(gender_scores, key=gender_scores.get) if any(gender_scores.values()) else "unknown",
        "top_interests": sorted(interest_scores.items(), key=lambda x: x[1], reverse=True)[:3],
    }
Reels Performance vs. Static Posts
A key strategic insight for content creators: comparing Reel performance against static post performance:
import statistics
from collections import defaultdict
def compare_content_type_performance(
    posts: list[dict],
    reels: list[dict],
) -> dict:
    """Compare average engagement across content formats.

    Buckets the feed into photos / carousels / videos, summarizes each
    bucket plus reels, and reports how many times more likes reels get
    than static photos (None when either side lacks like data).
    """
    buckets = {"photo": [], "video": [], "carousel": []}
    for post in posts:
        kind = post.get("media_type")
        if kind in buckets:
            buckets[kind].append(post)

    def summarize(items: list[dict]) -> dict:
        """Mean/median engagement stats for one bucket; {'count': 0} when empty."""
        if not items:
            return {"count": 0}
        likes = [entry.get("like_count", 0) for entry in items]
        comments = [entry.get("comment_count", 0) for entry in items]
        views = [entry.get("view_count", 0) for entry in items if entry.get("view_count")]
        return {
            "count": len(items),
            "avg_likes": round(statistics.mean(likes), 0),
            "median_likes": round(statistics.median(likes), 0),
            "avg_comments": round(statistics.mean(comments), 1),
            "avg_views": round(statistics.mean(views), 0) if views else None,
        }

    reel_stats = summarize(reels)
    photo_stats = summarize(buckets["photo"])
    carousel_stats = summarize(buckets["carousel"])
    video_stats = summarize(buckets["video"])

    # How many times more likes reels draw versus static photos.
    reels_vs_photos = None
    if reel_stats.get("avg_likes") and photo_stats.get("avg_likes") and photo_stats["avg_likes"] > 0:
        reels_vs_photos = round(reel_stats["avg_likes"] / photo_stats["avg_likes"], 2)

    result = {
        "photos": photo_stats,
        "carousels": carousel_stats,
        "videos": video_stats,
        "reels": reel_stats,
        "reels_vs_photos_multiplier": reels_vs_photos,
    }

    print("Content type performance comparison:")
    for content_type, stats in result.items():
        if isinstance(stats, dict) and stats.get("count"):
            print(f" {content_type:<12}: {stats['count']} posts, "
                  f"{stats.get('avg_likes', 0):.0f} avg likes, "
                  f"{stats.get('avg_comments', 0):.1f} avg comments")
    if reels_vs_photos:
        print(f"\n Reels get {reels_vs_photos}x more likes than static photos for this account")
    return result
Monitoring Profile Changes
Detect when a competitor account changes their bio, adds a link, or changes their profile picture — useful for competitive intelligence:
import json
import hashlib
import requests
from pathlib import Path
from datetime import datetime
PROFILE_HISTORY = Path("profile_change_history.json")
def load_profile_history() -> dict:
    """Return the saved per-username snapshot map, or {} when no history file exists yet."""
    if not PROFILE_HISTORY.exists():
        return {}
    return json.loads(PROFILE_HISTORY.read_text())
def save_profile_history(data: dict) -> None:
    """Overwrite the on-disk history file with *data* as pretty-printed JSON."""
    serialized = json.dumps(data, indent=2)
    PROFILE_HISTORY.write_text(serialized)
def profile_fingerprint(profile: dict) -> str:
    """Return a stable MD5 hex digest over the change-relevant profile fields.

    Only biography, website, followers, post_count and is_verified participate,
    so edits to other fields do not register as a change. Missing fields fall
    back to empty/zero/False defaults, keeping the digest deterministic.
    """
    defaults = {
        "biography": "",
        "website": "",
        "followers": 0,
        "post_count": 0,
        "is_verified": False,
    }
    key_fields = {name: profile.get(name, fallback) for name, fallback in defaults.items()}
    # sort_keys makes the JSON canonical so equal inputs always hash equal.
    canonical = json.dumps(key_fields, sort_keys=True).encode()
    return hashlib.md5(canonical).hexdigest()
def detect_profile_changes(
    username: str,
    session_id: str,
    proxy_url: str | None = None,
) -> list[str]:
    """Fetch @username's current profile and diff it against the stored snapshot.

    Returns a list of human-readable change descriptions (empty when nothing
    changed, the user has not been seen before, or the fetch failed).
    Side effects: prints any detected changes and rewrites the on-disk history
    file with the latest snapshot.

    Args:
        username: Instagram handle to check (without the '@').
        session_id: Value of an authenticated Instagram 'sessionid' cookie.
        proxy_url: Optional proxy URL applied to both http and https traffic.
    """
    history = load_profile_history()
    now = datetime.now().isoformat()
    # Fetch current profile via the private web_profile_info endpoint; the
    # mobile-app User-Agent and the X-IG-App-ID header are both sent so the
    # endpoint treats this as an app/web client request.
    headers = {
        "User-Agent": "Instagram 317.0.0.34.109 Android (30/11; 420dpi; 1080x2220; samsung; SM-G991B; o1s; exynos2100)",
        "X-IG-App-ID": "936619743392459",
    }
    cookies = {"sessionid": session_id}
    proxies = {"http": proxy_url, "https": proxy_url} if proxy_url else None
    resp = requests.get(
        "https://i.instagram.com/api/v1/users/web_profile_info/",
        params={"username": username},
        headers=headers,
        cookies=cookies,
        proxies=proxies,
        timeout=15,
    )
    if resp.status_code != 200:
        # Best-effort: a failed fetch reports "no changes" rather than raising,
        # and deliberately skips the history update below.
        return []
    current_profile = extract_profile_data(resp.json())
    current_fp = profile_fingerprint(current_profile)
    changes = []
    if username in history:
        last = history[username]["last_snapshot"]
        last_fp = history[username]["fingerprint"]
        # Cheap fingerprint comparison first; only diff field-by-field on mismatch.
        if current_fp != last_fp:
            # Detect specific changes
            for field in ["biography", "website", "is_verified"]:
                if last.get(field) != current_profile.get(field):
                    changes.append(f"{field} changed: '{last.get(field)}' -> '{current_profile.get(field)}'")
            # Follower deltas of 1000 or less are treated as noise, not a change.
            follower_diff = current_profile.get("followers", 0) - last.get("followers", 0)
            if abs(follower_diff) > 1000:
                changes.append(f"Follower count changed by {follower_diff:+,}")
    if changes:
        print(f"Changes detected for @{username}:")
        for change in changes:
            print(f" - {change}")
    # Always persist the latest snapshot — including on a first sighting or a
    # no-change run — so the next invocation diffs against fresh data.
    history[username] = {
        "fingerprint": current_fp,
        "last_snapshot": current_profile,
        "last_checked": now,
    }
    save_profile_history(history)
    return changes
Performance Optimization for Bulk Collection
When collecting hundreds of profiles, optimize throughput while respecting rate limits:
import asyncio
import json
import time
import random
import httpx
from pathlib import Path
from datetime import datetime
async def collect_profiles_async(
    usernames: list[str],
    sessions: list[str],
    proxy_url: str | None = None,
    concurrency: int = 3,
    delay_range: tuple[float, float] = (2.0, 5.0),
) -> list[dict]:
    """Fetch Instagram profiles concurrently with bounded parallelism.

    Sessions are rotated round-robin across requests so no single account
    absorbs all the traffic; a semaphore caps in-flight requests and a random
    post-request delay spaces them out to reduce rate-limit hits.

    Args:
        usernames: Handles to fetch (without '@').
        sessions: Pool of authenticated 'sessionid' cookie values to rotate.
        proxy_url: Optional proxy URL for all requests.
        concurrency: Max simultaneous requests.
        delay_range: (min, max) seconds to sleep after each request.

    Returns:
        Parsed profile dicts for every successful fetch; failures
        (non-200 responses, rate limits, exceptions) are silently dropped.
    """
    sem = asyncio.Semaphore(concurrency)
    session_idx = 0
    # NOTE: the unused `results` accumulator from the original was removed;
    # results are collected via asyncio.gather below.

    async def fetch_one(username: str) -> dict | None:
        nonlocal session_idx
        # Round-robin session rotation; safe without a lock because there is
        # no await between the read and the increment.
        session_id = sessions[session_idx % len(sessions)]
        session_idx += 1
        headers = {
            "User-Agent": "Instagram 317.0.0.34.109 Android (30/11; 420dpi; 1080x2220; samsung; SM-G991B; o1s; exynos2100)",
            "X-IG-App-ID": "936619743392459",
        }
        cookies = {"sessionid": session_id}
        async with sem:
            try:
                transport = httpx.AsyncHTTPTransport(proxy=proxy_url) if proxy_url else None
                async with httpx.AsyncClient(transport=transport) as client:
                    resp = await client.get(
                        "https://i.instagram.com/api/v1/users/web_profile_info/",
                        params={"username": username},
                        headers=headers,
                        cookies=cookies,
                        timeout=15,
                    )
                    if resp.status_code == 429:
                        print(f" Rate limited for @{username}")
                        return None
                    if resp.status_code != 200:
                        return None
                    return extract_profile_data(resp.json())
            except Exception as e:
                # Best-effort bulk collection: log and skip the failed profile.
                print(f" Error for @{username}: {e}")
                return None
            finally:
                # Delay runs while still holding the semaphore, throttling
                # overall request rate, not just concurrency.
                await asyncio.sleep(random.uniform(*delay_range))

    tasks = [fetch_one(u) for u in usernames]
    raw_results = await asyncio.gather(*tasks)
    return [r for r in raw_results if r]
# For running async from sync context
def collect_profiles_parallel(
    usernames: list[str],
    sessions: list[str],
    proxy_url: str | None = None,
) -> list[dict]:
    """Synchronous wrapper around collect_profiles_async (concurrency fixed at 3).

    Note: asyncio.run() raises RuntimeError if called from inside a running
    event loop — in that case await collect_profiles_async directly instead.
    """
    return asyncio.run(
        collect_profiles_async(usernames, sessions, proxy_url=proxy_url, concurrency=3)
    )