How to Scrape Instagram Profiles in 2026: Bio, Followers, Posts & Media
Instagram profile data powers influencer research tools, competitive analysis, audience intelligence platforms, and content benchmarking. Getting this data programmatically requires navigating Meta's evolving restrictions — Instagram has steadily locked down API access while making the platform more dependent on authenticated sessions.
This guide covers three approaches: public og:meta scraping (no auth, limited data), the private mobile API (auth required, full data), and GraphQL endpoint extraction. Each has different tradeoffs around data richness, reliability, and legal risk.
What Profile Data Is Available
A complete Instagram profile contains:
- Basic info: username, full name, bio, website URL, profile picture
- Stats: follower count, following count, post count
- Account type: personal, creator, or business
- Business info: category, contact options, address (for business accounts)
- Verification status: blue checkmark indicator
- Post feed: media items with like counts, comment counts, captions, timestamps
- Reels: short video content metadata
- Tagged posts: content where the user is tagged by others
- Highlights: story highlight collections
Without authentication, you can get basic bio/stats only. The private mobile API gives everything except private account content.
Approach 1: Public og:meta Scraping (No Auth)
Instagram's profile pages include OpenGraph meta tags for search engine indexing. These tags contain a condensed version of the profile bio and stats in the og:description field.
import requests
from html.parser import HTMLParser
import re
class OGParser(HTMLParser):
    """Minimal HTML parser that collects OpenGraph ``<meta>`` tags.

    After feeding a document, ``og_data`` maps each ``og:*`` property
    name to the value of its ``content`` attribute.
    """

    def __init__(self):
        super().__init__()
        # property name -> content value, e.g. 'og:title' -> '...'
        self.og_data = {}

    def handle_starttag(self, tag, attrs):
        if tag != 'meta':
            return
        attr_map = dict(attrs)
        property_name = attr_map.get('property', '')
        if property_name.startswith('og:'):
            self.og_data[property_name] = attr_map.get('content', '')
def scrape_public_profile(username: str, proxy: dict = None) -> dict:
    """Fetch a public Instagram profile's og:meta data (no auth required).

    Parses follower/following/post counts out of the og:description tag.
    Handles both exact counts ("12,345 Followers") and the abbreviated
    forms Instagram uses for large accounts ("1.5M Followers", "281K"),
    which the previous digits-only pattern failed to match.

    Args:
        username: Instagram handle without the leading '@'.
        proxy: optional requests-style proxies dict.

    Returns:
        dict with username, title, description, profile_pic_url,
        followers/following/post_count (ints, or None when the
        description could not be parsed), and profile_url.

    Raises:
        requests.HTTPError: on non-2xx responses.
    """
    url = f'https://www.instagram.com/{username}/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 Chrome/131.0.0.0 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.9',
    }
    resp = requests.get(url, headers=headers, proxies=proxy, timeout=15)
    resp.raise_for_status()

    parser = OGParser()
    parser.feed(resp.text)
    og = parser.og_data

    # og:description format:
    # 'X Followers, Y Following, Z Posts - See Instagram photos and videos from ...'
    # where X/Y/Z may be exact ('12,345') or abbreviated ('1.5M', '281K').
    desc = og.get('og:description', '')

    def parse_num(token: str) -> int:
        """Convert '12,345', '1.5M', '281K', '2B' etc. to an int."""
        token = token.strip()
        suffix = token[-1].upper() if token else ''
        scales = {'K': 1_000, 'M': 1_000_000, 'B': 1_000_000_000}
        if suffix in scales:
            # Abbreviated form: the dot is a decimal point ('1.5M').
            return int(float(token[:-1].replace(',', '')) * scales[suffix])
        # Plain number: ',' and '.' are thousands separators.
        return int(token.replace(',', '').replace('.', ''))

    followers = following = posts = None
    m = re.search(
        r'([\d,.]+[KMB]?)\s*Followers,\s*([\d,.]+[KMB]?)\s*Following,\s*([\d,.]+[KMB]?)\s*Posts',
        desc,
        re.IGNORECASE,
    )
    if m:
        followers = parse_num(m.group(1))
        following = parse_num(m.group(2))
        posts = parse_num(m.group(3))

    return {
        'username': username,
        'title': og.get('og:title', ''),
        'description': desc,
        'profile_pic_url': og.get('og:image', ''),
        'followers': followers,
        'following': following,
        'post_count': posts,
        'profile_url': og.get('og:url', f'https://www.instagram.com/{username}/'),
    }
# Works without authentication — but limited to what og:meta exposes
profile = scrape_public_profile('natgeo')
# followers may be None when Instagram serves a login page or the
# og:description format changes — formatting None with ':,' raises TypeError.
if profile['followers'] is not None:
    print(f"@{profile['username']}: {profile['followers']:,} followers")
else:
    print(f"@{profile['username']}: follower count unavailable")
Limitations: After 20-30 requests from the same IP, Instagram starts returning login redirect pages — per-IP request volume is the first thing Instagram throttles. Residential proxies from ThorData extend the viable volume significantly by rotating IPs automatically.
Approach 2: Private Mobile API (Full Data)
The Instagram mobile app's private API returns rich JSON with complete profile data. You need a valid session cookie from a logged-in account.
import requests
import time
import random
# Credentials/identifiers for the private mobile API.
SESSION_ID = 'your-session-id-from-browser'  # value of the 'sessionid' cookie from a logged-in session
APP_ID = '936619743392459'  # Instagram's public app ID, sent as X-IG-App-ID
# Headers mimicking the official Android app client.
HEADERS = {
    'User-Agent': 'Instagram 317.0.0.34.109 Android (30/11; 420dpi; 1080x2220; samsung; SM-G991B; o1s; exynos2100)',
    'X-IG-App-ID': APP_ID,
    'Accept': 'application/json',
    'Accept-Language': 'en-US',
}
# Authentication cookie attached to every private-API request.
COOKIES = {'sessionid': SESSION_ID}
def get_profile_by_username(username: str) -> dict:
    """Fetch a profile via the web_profile_info endpoint; returns the raw JSON."""
    endpoint = 'https://i.instagram.com/api/v1/users/web_profile_info/'
    response = requests.get(
        endpoint,
        params={'username': username},
        headers=HEADERS,
        cookies=COOKIES,
        timeout=15,
    )
    response.raise_for_status()
    return response.json()
def get_profile_by_id(user_id: str) -> dict:
    """Fetch a profile by numeric user ID via users/{id}/info; returns the raw JSON."""
    endpoint = f'https://i.instagram.com/api/v1/users/{user_id}/info/'
    response = requests.get(endpoint, headers=HEADERS, cookies=COOKIES, timeout=15)
    response.raise_for_status()
    return response.json()
def extract_profile_data(api_response: dict) -> dict:
    """Normalize a raw profile API response into a flat dict.

    Accepts both response shapes seen in this file: web_profile_info
    nests the user under data.user, while users/{id}/info returns a
    top-level 'user' key.
    """
    user = api_response.get('data', {}).get('user', {}) or api_response.get('user', {})

    def edge_count(edge_name: str) -> int:
        # Counts live under {'<edge>': {'count': N}}; default 0 when absent.
        return user.get(edge_name, {}).get('count', 0)

    return {
        'id': user.get('id'),
        'username': user.get('username'),
        'full_name': user.get('full_name'),
        'biography': user.get('biography'),
        'website': user.get('external_url'),
        'profile_pic_url': user.get('profile_pic_url_hd') or user.get('profile_pic_url'),
        'followers': edge_count('edge_followed_by'),
        'following': edge_count('edge_follow'),
        'post_count': edge_count('edge_owner_to_timeline_media'),
        'is_verified': user.get('is_verified', False),
        'is_private': user.get('is_private', False),
        'is_business': user.get('is_business_account', False),
        'business_category': user.get('business_category_name'),
        'is_professional': user.get('is_professional_account', False),
    }
# Fetch and parse a profile
raw = get_profile_by_username('natgeo')
profile = extract_profile_data(raw)
print(f"@{profile['username']} ({profile['full_name']})")
print(f" Followers: {profile['followers']:,}")
print(f" Following: {profile['following']:,}")
print(f" Posts: {profile['post_count']:,}")
print(f" Verified: {profile['is_verified']}")
# biography is None for accounts with no bio — slicing None raises TypeError
print(f" Bio: {(profile['biography'] or '')[:100]}")
Fetching Post Feed
Once you have a user ID, you can fetch their post history with pagination:
def get_user_posts(user_id: str, max_pages: int = 5) -> list[dict]:
    """Page through a user's feed and return normalized post dicts.

    Pagination is cursor-based via next_max_id. On HTTP 429 the loop
    waits 60s and continues; NOTE(review): the page counter still
    advances on that path, so a rate-limited run may fetch fewer than
    max_pages pages of data — confirm this is acceptable.
    """
    posts = []
    url = f'https://i.instagram.com/api/v1/feed/user/{user_id}/'
    max_id = None  # pagination cursor; omitted on the first request
    for page_num in range(max_pages):
        params = {}
        if max_id:
            params['max_id'] = max_id
        resp = requests.get(url, params=params, headers=HEADERS, cookies=COOKIES, timeout=15)
        if resp.status_code == 429:
            print(f' Rate limited on page {page_num}, waiting 60s')
            time.sleep(60)
            continue
        resp.raise_for_status()
        data = resp.json()
        for item in data.get('items', []):
            # caption can be null in the API response
            caption_obj = item.get('caption') or {}
            # Get image URL
            image_url = None
            if item.get('image_versions2'):
                candidates = item['image_versions2'].get('candidates', [])
                if candidates:
                    # first candidate is used; presumably the largest rendition — TODO confirm
                    image_url = candidates[0]['url']
            media_type = item.get('media_type', 1)
            # numeric media_type codes observed from the API
            type_map = {1: 'photo', 2: 'video', 8: 'carousel'}
            posts.append({
                'id': item.get('pk'),
                'shortcode': item.get('code'),
                'media_type': type_map.get(media_type, 'photo'),
                'like_count': item.get('like_count', 0),
                'comment_count': item.get('comment_count', 0),
                'view_count': item.get('view_count'),
                'taken_at': item.get('taken_at'),
                'caption': caption_obj.get('text', ''),
                'image_url': image_url,
                'post_url': f"https://www.instagram.com/p/{item.get('code')}/" if item.get('code') else None,
                'tagged_users': [
                    t.get('user', {}).get('username')
                    for t in item.get('usertags', {}).get('in', [])
                ],
                'location': item.get('location', {}).get('name') if item.get('location') else None,
            })
        if not data.get('more_available'):
            break
        max_id = data.get('next_max_id')
        # jittered delay between pages to avoid a fixed-interval bot signal
        time.sleep(random.uniform(2, 4))
    return posts
# Fetch posts for a user
raw = get_profile_by_username('natgeo')
profile = extract_profile_data(raw)
posts = get_user_posts(profile['id'], max_pages=3)
print(f'Fetched {len(posts)} posts')
for p in posts[:5]:
    # taken_at is printed as the raw unix timestamp from the API
    print(f" {p['taken_at']}: {p['like_count']:,} likes, {p['comment_count']:,} comments ({p['media_type']})")
Fetching Reels Data
def get_user_reels(user_id: str, max_pages: int = 3) -> list[dict]:
    """Page through a user's Reels via the clips endpoint.

    Unlike the feed endpoint, clips expects a POST with form data and
    paginates via paging_info.max_id. Any non-200 response ends the
    loop early (no retry).
    """
    url = f'https://i.instagram.com/api/v1/clips/user/'
    reels = []
    max_id = None  # pagination cursor from paging_info.max_id
    for _ in range(max_pages):
        payload = {
            'target_user_id': user_id,
            'page_size': 12,
        }
        if max_id:
            payload['max_id'] = max_id
        resp = requests.post(url, data=payload, headers=HEADERS, cookies=COOKIES, timeout=15)
        if resp.status_code != 200:
            break
        data = resp.json()
        for item in data.get('items', []):
            # each item wraps the actual post object under a 'media' key
            media = item.get('media', {})
            reels.append({
                'id': media.get('pk'),
                'shortcode': media.get('code'),
                'like_count': media.get('like_count', 0),
                'comment_count': media.get('comment_count', 0),
                'play_count': media.get('play_count', 0),
                'taken_at': media.get('taken_at'),
                'duration': media.get('video_duration'),
                'caption': (media.get('caption') or {}).get('text', ''),
                'post_url': f"https://www.instagram.com/reel/{media.get('code')}/",
            })
        if not data.get('paging_info', {}).get('more_available'):
            break
        max_id = data.get('paging_info', {}).get('max_id')
        # jittered delay between pages
        time.sleep(random.uniform(2, 4))
    return reels
Bulk Profile Collection
For research requiring many profiles, use a pipeline with session rotation and proxy support:
import json
from pathlib import Path
from datetime import datetime
# Multiple sessions to distribute load
SESSION_POOL = [
'session_id_1',
'session_id_2',
'session_id_3',
]
# ThorData residential proxy — https://thordata.partnerstack.com/partner/0a0x4nzb (or [Oxylabs](https://oxylabs.go2cloud.org/aff_c?offer_id=7&aff_id=2066&url_id=174))
PROXY_URL = 'http://USER:[email protected]:9000'
def make_session(session_id: str) -> requests.Session:
    """Build a requests.Session pre-configured with the mobile-app
    headers, the given session cookie, and the residential proxy."""
    sess = requests.Session()
    sess.headers.update(HEADERS)
    sess.cookies.set('sessionid', session_id, domain='.instagram.com')
    sess.proxies = {'http': PROXY_URL, 'https': PROXY_URL}
    return sess
def collect_profiles_bulk(
    usernames: list[str],
    output_dir: str = 'profiles',
    include_posts: bool = True,
    posts_per_profile: int = 20,
) -> list[dict]:
    """Collect many profiles with session rotation, saving each to JSON.

    Fixes over the naive version: rate-limited usernames are retried once
    and then recorded as error entries instead of being silently dropped,
    and session rotation starts at SESSION_POOL[0] instead of skipping it
    on the first cycle.

    Args:
        usernames: Instagram handles to fetch.
        output_dir: directory for per-profile JSON files and the combined dump.
        include_posts: also fetch recent posts for public accounts.
        posts_per_profile: approximate number of posts to fetch per profile.

    Returns:
        List of profile dicts; failures appear as
        {'username': ..., 'error': ...} entries.
    """
    out = Path(output_dir)
    out.mkdir(exist_ok=True)
    results = []
    session = None
    for i, username in enumerate(usernames):
        # Rotate session every 10 requests; i // 10 walks the pool from
        # index 0 so every session (including the first) gets used.
        if session is None or i % 10 == 0:
            session = make_session(SESSION_POOL[(i // 10) % len(SESSION_POOL)])
        print(f'Fetching @{username} ({i+1}/{len(usernames)})')
        try:
            url = 'https://i.instagram.com/api/v1/users/web_profile_info/'
            resp = session.get(url, params={'username': username}, timeout=15)
            if resp.status_code == 429:
                # Back off once and retry the same username rather than
                # silently losing it from the results.
                print(' Rate limited — waiting 90s')
                time.sleep(90)
                resp = session.get(url, params={'username': username}, timeout=15)
                if resp.status_code == 429:
                    results.append({'username': username, 'error': 'rate_limited'})
                    continue
            resp.raise_for_status()
            profile_data = extract_profile_data(resp.json())
            if include_posts and not profile_data.get('is_private'):
                user_id = profile_data.get('id')
                if user_id:
                    # Use a smaller page count for bulk collection (~12 posts/page)
                    post_pages = max(1, posts_per_profile // 12)
                    posts = get_user_posts(user_id, max_pages=post_pages)
                    profile_data['posts'] = posts
                    time.sleep(random.uniform(1.5, 3))
            results.append(profile_data)
            # Save each profile as it completes so a crash mid-run
            # doesn't lose earlier work.
            profile_file = out / f'{username}.json'
            profile_file.write_text(json.dumps(profile_data, indent=2, ensure_ascii=False))
        except requests.exceptions.HTTPError as e:
            print(f' Error for @{username}: {e}')
            results.append({'username': username, 'error': str(e)})
        time.sleep(random.uniform(2, 5))
    # Save combined results
    timestamp = datetime.now().strftime('%Y%m%d_%H%M')
    combined = out / f'all_profiles_{timestamp}.json'
    combined.write_text(json.dumps(results, indent=2, ensure_ascii=False))
    print(f'Saved {len(results)} profiles to {combined}')
    return results
Engagement Rate Calculation
Engagement rate is the key metric for influencer analysis:
import statistics
def calculate_engagement_rate(profile: dict, posts: list[dict]) -> dict:
    """Compute engagement rate over the 20 most recent posts.

    engagement_rate = avg(likes + comments) / followers * 100.
    Returns {'engagement_rate': None} when followers or posts are missing.
    """
    followers = profile.get('followers', 0)
    if not followers or not posts:
        return {'engagement_rate': None}

    # Most recent first, capped at 20 posts.
    recent = sorted(posts, key=lambda p: p.get('taken_at', 0), reverse=True)[:20]
    likes = [p.get('like_count', 0) for p in recent]
    comments = [p.get('comment_count', 0) for p in recent]
    interactions = [lk + cm for lk, cm in zip(likes, comments)]
    avg_interactions = statistics.mean(interactions) if interactions else 0

    return {
        'engagement_rate': round(avg_interactions / followers * 100, 4),
        'avg_likes': statistics.mean(likes),
        'avg_comments': statistics.mean(comments),
        'avg_interactions': avg_interactions,
        'posts_analyzed': len(recent),
    }
# Industry benchmarks:
# < 1% = Low engagement
# 1-3% = Average
# 3-6% = High engagement
# > 6% = Very high (common in micro-influencers)
CDN URL Expiry
Instagram image and video URLs expire within 24-48 hours. The expiry timestamp is embedded in the URL parameters. Always download media immediately — do not store CDN URLs and expect them to work later:
import httpx
from pathlib import Path
import re
from urllib.parse import urlparse, parse_qs
def get_url_expiry_timestamp(cdn_url: str) -> int | None:
# Some URLs have expiry in hex (oe= param), others in path
parsed = urlparse(cdn_url)
params = parse_qs(parsed.query)
if 'oe' in params:
return int(params['oe'][0], 16) # hex timestamp
match = re.search(r'[/_]e(\d{10})', cdn_url)
if match:
return int(match.group(1))
return None
def download_media_batch(posts: list[dict], output_dir: str = 'media') -> None:
    """Download each post's image immediately (CDN URLs expire in 24-48h).

    Saves to <output_dir>/<post_id>.jpg. Failures are logged and skipped
    so one bad URL doesn't abort the whole batch.
    NOTE(review): relies on `time` being imported earlier in the file —
    it is not re-imported in this snippet's import block.
    """
    out = Path(output_dir)
    out.mkdir(exist_ok=True)
    with httpx.Client(timeout=30) as client:
        for post in posts:
            post_id = post.get('id', 'unknown')
            img_url = post.get('image_url')
            if img_url:
                try:
                    r = client.get(img_url)
                    r.raise_for_status()
                    (out / f'{post_id}.jpg').write_bytes(r.content)
                except Exception as e:
                    # best-effort: log and move on to the next post
                    print(f' Failed to download {post_id}: {e}')
            time.sleep(0.5)  # gentle on the CDN
Anti-Detection Best Practices
Instagram's detection systems look for these bot signals:
| Signal | Risk | Mitigation |
|---|---|---|
| Datacenter IP | Critical | Use residential proxies |
| Fixed request intervals | High | Randomize delays 1.5-5s |
| High volume per session | High | Rotate session cookies |
| Same IP + multiple accounts | High | Separate IPs per session |
| Python default user agent | Medium | Use mobile app user agent |
| Sequential username scraping | Medium | Randomize order |
For residential proxy rotation, ThorData provides pools of real residential IPs that distribute requests across different households and ISPs. Each request appears to come from a different user, which is the fundamental requirement for bypassing Instagram's per-IP rate limits.
# Proxy-aware request wrapper with retry logic
import time, random, requests
from typing import Optional
# Proxies dict applied to every request; both schemes route through the
# same residential gateway.
PROXY = {'http': 'http://USER:[email protected]:9000',
         'https': 'http://USER:[email protected]:9000'}
def instagram_get(url: str, params: dict = None, max_retries: int = 3) -> Optional[dict]:
    """GET an Instagram API URL through the proxy with retry and backoff.

    Returns the parsed JSON body, or None on session expiry (401), any
    non-retryable failure, or when all retries are exhausted.
    """
    for attempt in range(max_retries):
        try:
            resp = requests.get(
                url,
                params=params,
                headers=HEADERS,
                cookies=COOKIES,
                proxies=PROXY,
                timeout=15,
            )
            if resp.status_code == 429:
                wait = (2 ** attempt) * 30  # exponential backoff: 30s, 60s, 120s
                print(f' Rate limited — waiting {wait}s')
                time.sleep(wait)
                continue
            if resp.status_code == 401:
                # Session cookie no longer valid — retrying won't help.
                print(' Session expired — need new session cookie')
                return None
            resp.raise_for_status()
            return resp.json()
        except requests.exceptions.ConnectionError:
            # Transient network/proxy failure: linear backoff, then retry.
            print(f' Connection error on attempt {attempt+1}')
            time.sleep(5 * (attempt + 1))
        except Exception as e:
            # Any other failure (HTTP error status, bad JSON) is treated as fatal.
            print(f' Request failed: {e}')
            return None
    return None
Legal Considerations
Using Instagram's private API violates Meta's Terms of Service. Meta has sued and settled with several scraping operations. Courts have generally ruled that scraping publicly accessible data is not a CFAA violation (hiQ v. LinkedIn), but Meta's position is more aggressive, especially for data accessed via circumvented authentication.
The safest approach: use the og:meta method for basic profile stats (bio, follower counts) — this is genuinely public data. Use the private API sparingly for research purposes and keep volume low. Never sell scraped Instagram data or build a product that depends on sustained large-scale scraping.
Summary
Instagram profile scraping in 2026 requires either the limited og:meta approach (no auth, basic stats) or the private mobile API (session required, full data). The mobile API returns comprehensive profile data — followers, following, post counts, verification status, business type, and the full post feed with engagement metrics.
Rate limits are around 100-200 requests per day per session. Rotate session cookies across multiple accounts, use ThorData's residential proxies to distribute IP-level load, randomize delays between requests, and download media immediately since CDN URLs expire within 24-48 hours.
Influencer Discovery and Scoring
The most common commercial use case for profile scraping is building influencer databases. Here is a scoring system that ranks profiles by likely brand-partnership value:
import statistics
from typing import Optional
def score_influencer(
    profile: dict,
    posts: list[dict],
    weight_reach: float = 0.3,
    weight_engagement: float = 0.5,
    weight_consistency: float = 0.2,
) -> dict:
    """Score a profile 0-100 for likely brand-partnership value.

    Blends three normalized components: reach (log-scale follower count,
    10M == 100), engagement rate (benchmarked per follower tier), and
    posting consistency (posts/week versus an ideal of 5).
    """
    import math

    followers = profile.get("followers", 0)
    if followers < 1000:
        # Below the nano-influencer floor — not worth scoring.
        return {"score": 0, "tier": "nano", "reason": "Too small"}

    # --- Engagement rate over the 20 most recent posts ---
    engagement_rate = 0.0
    if posts:
        recent = sorted(posts, key=lambda p: p.get("taken_at", 0), reverse=True)[:20]
        totals = [p.get("like_count", 0) + p.get("comment_count", 0) for p in recent]
        mean_interactions = statistics.mean(totals) if totals else 0
        engagement_rate = (mean_interactions / followers) * 100

    # --- Posting consistency: posts/week over the observed span ---
    posting_frequency = 0
    if len(posts) >= 2:
        chronological = sorted(posts, key=lambda p: p.get("taken_at", 0))
        oldest_ts = chronological[0].get("taken_at", 0)
        newest_ts = chronological[-1].get("taken_at", 0)
        if oldest_ts and newest_ts and newest_ts > oldest_ts:
            weeks_span = (newest_ts - oldest_ts) / (7 * 24 * 3600)
            posting_frequency = len(posts) / weeks_span if weeks_span > 0 else 0

    # --- Normalize each component to 0-100 ---
    # Reach: log scale capped at 10M followers.
    reach_score = min(100, math.log10(max(followers, 1)) / math.log10(10_000_000) * 100)

    # Engagement: the "excellent" benchmark shrinks as the audience grows.
    if followers < 10_000:
        eng_benchmark = 5.0  # nano: 5%+ is excellent
    elif followers < 100_000:
        eng_benchmark = 3.0  # micro: 3%+ is excellent
    elif followers < 1_000_000:
        eng_benchmark = 2.0  # mid: 2%+ is excellent
    else:
        eng_benchmark = 1.0  # mega: 1%+ is excellent
    eng_score = min(100, (engagement_rate / eng_benchmark) * 100)

    # Consistency: 5 posts/week is treated as the ideal cadence.
    ideal_frequency = 5.0
    freq_score = min(100, (posting_frequency / ideal_frequency) * 100) if posting_frequency > 0 else 0

    overall_score = (
        weight_reach * reach_score
        + weight_engagement * eng_score
        + weight_consistency * freq_score
    )

    # Follower-count tier label.
    if followers < 10_000:
        tier = "nano"
    elif followers < 100_000:
        tier = "micro"
    elif followers < 500_000:
        tier = "mid"
    elif followers < 1_000_000:
        tier = "macro"
    else:
        tier = "mega"

    return {
        "score": round(overall_score, 1),
        "tier": tier,
        "followers": followers,
        "engagement_rate": round(engagement_rate, 3),
        "posting_frequency_weekly": round(posting_frequency, 1),
        "reach_score": round(reach_score, 1),
        "engagement_score": round(eng_score, 1),
        "consistency_score": round(freq_score, 1),
    }
Niche Detection from Bio and Caption Analysis
Automatically categorize profiles into niches using text analysis:
import re
from collections import Counter
# Keyword regexes (matched case-insensitively in detect_niche) mapping
# each niche label to vocabulary commonly seen in that niche's bios and captions.
NICHE_PATTERNS = {
    "fitness": r"\b(fitness|gym|workout|training|crossfit|yoga|nutrition|bodybuilding|weightlifting|hiit)\b",
    "travel": r"\b(travel|wanderlust|adventure|explore|backpacking|nomad|destination|itinerary|vacation)\b",
    "food": r"\b(food|recipe|cooking|chef|restaurant|foodie|gastronomy|cuisine|meal|delicious)\b",
    "tech": r"\b(tech|coding|programming|developer|software|startup|saas|ai|machine learning|data science)\b",
    "fashion": r"\b(fashion|style|outfit|ootd|streetwear|luxury|designer|brand|wear|clothing)\b",
    "beauty": r"\b(beauty|makeup|skincare|cosmetics|haircare|grooming|glam|tutorial|swatch)\b",
    "gaming": r"\b(gaming|gamer|esports|twitch|streamer|playstation|xbox|nintendo|fps|rpg)\b",
    "business": r"\b(entrepreneur|business|ceo|founder|startup|investment|finance|money|wealth)\b",
    "photography": r"\b(photography|photographer|photo|camera|lightroom|portrait|landscape|shoot)\b",
    "music": r"\b(music|musician|singer|producer|dj|band|album|concert|studio|lyrics)\b",
}
def detect_niche(profile: dict, posts: list[dict] = None) -> dict:
    """Classify a profile into a content niche from bio + recent captions.

    Returns the best-matching niche, a 0-100 confidence (the winner's
    share of all keyword hits), and the per-niche hit counts.
    """
    bio = (profile.get("biography") or "").lower()
    captions = " ".join(p.get("caption", "") for p in (posts or [])[:20]).lower()
    text = bio + " " + captions

    hits = {
        niche: len(re.findall(pattern, text, re.IGNORECASE))
        for niche, pattern in NICHE_PATTERNS.items()
    }

    if not any(hits.values()):
        return {"primary_niche": "general", "confidence": 0, "all_scores": hits}

    primary = max(hits, key=hits.get)
    total = sum(hits.values())
    confidence = (hits[primary] / total * 100) if total > 0 else 0
    return {
        "primary_niche": primary,
        "confidence": round(confidence, 1),
        "all_scores": hits,
    }
Profile Comparison and Benchmarking
Compare a set of profiles against each other to identify top performers in a niche:
import json
from pathlib import Path
from datetime import datetime
def benchmark_profiles(profiles_dir: str) -> list[dict]:
    """Load saved profile JSONs, score and niche-classify each, and print
    a ranked leaderboard of the top 20.

    Returns every loaded profile, sorted by score descending.
    """
    loaded = []
    for path in Path(profiles_dir).glob("*.json"):
        try:
            record = json.loads(path.read_text())
        except Exception:
            continue  # skip unreadable/corrupt files
        if "username" in record:
            loaded.append(record)

    if not loaded:
        return []

    # Attach score and niche metadata to each profile.
    scored_profiles = []
    for record in loaded:
        posts = record.get("posts", [])
        scoring = score_influencer(record, posts)
        niche = detect_niche(record, posts)
        scored_profiles.append({
            **record,
            "score": scoring.get("score", 0),
            "tier": scoring.get("tier"),
            "engagement_rate": scoring.get("engagement_rate"),
            "primary_niche": niche.get("primary_niche"),
        })

    scored_profiles.sort(key=lambda entry: entry.get("score", 0), reverse=True)

    print(f"\nProfile Benchmark ({len(scored_profiles)} profiles):")
    print(f"{'Username':<25} {'Followers':>10} {'Eng Rate':>9} {'Score':>7} {'Tier':<8} {'Niche'}")
    print("-" * 80)
    for entry in scored_profiles[:20]:
        followers = entry.get("followers", 0)
        eng = entry.get("engagement_rate") or 0
        score = entry.get("score", 0)
        print(
            f"{entry.get('username', 'N/A'):<25} {followers:>10,} "
            f"{eng:>8.2f}% {score:>7.1f} {entry.get('tier', 'N/A'):<8} {entry.get('primary_niche', 'N/A')}"
        )
    return scored_profiles
Follower Growth Tracking
To track follower growth over time, take periodic snapshots:
import json
import time
import random
import requests
from pathlib import Path
from datetime import datetime
GROWTH_DB = Path("instagram_growth_tracker.json")
def load_growth_data() -> dict:
    """Read the snapshot DB from disk, or start fresh with an empty dict."""
    if not GROWTH_DB.exists():
        return {}
    return json.loads(GROWTH_DB.read_text())
def save_growth_data(data: dict) -> None:
    """Persist the snapshot DB as pretty-printed JSON."""
    serialized = json.dumps(data, indent=2)
    GROWTH_DB.write_text(serialized)
def snapshot_profiles(
    usernames: list[str],
    session_id: str,
    proxy_url: str = None,
) -> None:
    """Record a follower/post-count snapshot for each username.

    Appends one timestamped entry per username to the growth DB and
    persists everything at the end of the run.
    NOTE(review): on HTTP 429 the username is skipped for this run after
    a 60s wait (no retry) — confirm that gap is acceptable for trend data.
    """
    growth_data = load_growth_data()
    # One shared timestamp for the whole run, so entries group cleanly.
    now = datetime.now().isoformat()
    headers = {
        "User-Agent": "Instagram 317.0.0.34.109 Android (30/11; 420dpi; 1080x2220; samsung; SM-G991B; o1s; exynos2100)",
        "X-IG-App-ID": "936619743392459",
    }
    cookies = {"sessionid": session_id}
    proxies = {"http": proxy_url, "https": proxy_url} if proxy_url else None
    for username in usernames:
        resp = requests.get(
            "https://i.instagram.com/api/v1/users/web_profile_info/",
            params={"username": username},
            headers=headers,
            cookies=cookies,
            proxies=proxies,
            timeout=15,
        )
        if resp.status_code == 429:
            print(f" Rate limited — waiting 60s")
            time.sleep(60)
            continue
        if resp.status_code != 200:
            print(f" Failed @{username}: {resp.status_code}")
            continue
        user = resp.json().get("data", {}).get("user", {})
        followers = user.get("edge_followed_by", {}).get("count", 0)
        posts = user.get("edge_owner_to_timeline_media", {}).get("count", 0)
        if username not in growth_data:
            growth_data[username] = []
        growth_data[username].append({
            "timestamp": now,
            "followers": followers,
            "post_count": posts,
        })
        print(f" @{username}: {followers:,} followers")
        # jittered delay between profile fetches
        time.sleep(random.uniform(2, 4))
    # Persist all snapshots in one write at the end of the run.
    save_growth_data(growth_data)
def calculate_follower_growth(username: str, days: int = 30) -> dict | None:
    """Summarize follower growth for a username over a trailing window.

    Uses the snapshot history recorded by snapshot_profiles(). Returns
    None when fewer than two snapshots exist; falls back to the last two
    snapshots when the window holds fewer than two.
    """
    from datetime import timedelta

    snapshots = load_growth_data().get(username, [])
    if len(snapshots) < 2:
        return None

    ordered = sorted(snapshots, key=lambda snap: snap["timestamp"])
    # ISO-8601 strings sort chronologically, so plain string comparison works.
    cutoff = (datetime.now() - timedelta(days=days)).isoformat()
    window = [snap for snap in ordered if snap["timestamp"] >= cutoff]
    if len(window) < 2:
        window = ordered[-2:]

    first, last = window[0], window[-1]
    gained = last["followers"] - first["followers"]
    elapsed_days = (
        datetime.fromisoformat(last["timestamp"]) - datetime.fromisoformat(first["timestamp"])
    ).total_seconds() / 86400

    return {
        "username": username,
        "current_followers": last["followers"],
        "follower_gain": gained,
        "daily_growth_avg": round(gained / elapsed_days, 1) if elapsed_days > 0 else 0,
        "growth_rate_pct": round(gained / first["followers"] * 100, 3) if first["followers"] > 0 else 0,
        "days_measured": round(elapsed_days, 1),
    }
Integration with ThorData for Scale
When building a profile monitoring system that tracks hundreds of accounts, the volume quickly exceeds what a single session and IP can handle. Here is a production-ready integration with ThorData's residential proxy pool:
import requests
import random
import time
from typing import Optional
# ThorData residential proxy — https://thordata.partnerstack.com/partner/0a0x4nzb (or [Oxylabs](https://oxylabs.go2cloud.org/aff_c?offer_id=7&aff_id=2066&url_id=174))
THORDATA_USER = "your_user"
THORDATA_PASS = "your_pass"
THORDATA_HOST = "proxy.thordata.net"
THORDATA_PORT = 9000
def get_proxy_url(country_code: str = "us") -> str:
    """Build a ThorData proxy URL geo-targeted via the -cc-<country> username suffix."""
    credentials = f"{THORDATA_USER}:{THORDATA_PASS}-cc-{country_code}"
    return f"http://{credentials}@{THORDATA_HOST}:{THORDATA_PORT}"
class InstagramProfileCollector:
def __init__(self, sessions: list[str], use_proxies: bool = True):
self.sessions = sessions
self.use_proxies = use_proxies
self.session_idx = 0
self.request_counts = {s: 0 for s in sessions}
self.countries = ["us", "gb", "ca", "au", "de"]
def _get_next_session(self) -> str:
# Find least-used available session
available = [
s for s in self.sessions
if self.request_counts.get(s, 0) < 80
]
if not available:
raise RuntimeError("All sessions at capacity")
return min(available, key=lambda s: self.request_counts.get(s, 0))
def _get_proxy(self) -> dict | None:
if not self.use_proxies:
return None
country = random.choice(self.countries)
proxy_url = get_proxy_url(country)
return {"http": proxy_url, "https": proxy_url}
def fetch_profile(self, username: str) -> Optional[dict]:
session_id = self._get_next_session()
proxies = self._get_proxy()
headers = {
"User-Agent": "Instagram 317.0.0.34.109 Android (30/11; 420dpi; 1080x2220; samsung; SM-G991B; o1s; exynos2100)",
"X-IG-App-ID": "936619743392459",
}
cookies = {"sessionid": session_id}
try:
resp = requests.get(
"https://i.instagram.com/api/v1/users/web_profile_info/",
params={"username": username},
headers=headers,
cookies=cookies,
proxies=proxies,
timeout=15,
)
if resp.status_code == 429:
self.request_counts[session_id] = 999 # mark as exhausted
return None
resp.raise_for_status()
self.request_counts[session_id] = self.request_counts.get(session_id, 0) + 1
return resp.json()
except Exception as e:
print(f" Error fetching @{username}: {e}")
return None
def collect_profiles(
self,
usernames: list[str],
delay_range: tuple = (2.0, 5.0),
) -> list[dict]:
results = []
for i, username in enumerate(usernames):
print(f"Fetching @{username} ({i+1}/{len(usernames)})")
raw = self.fetch_profile(username)
if raw:
profile = extract_profile_data(raw)
results.append(profile)
print(f" {profile.get('followers', 0):,} followers")
time.sleep(random.uniform(*delay_range))
return results
Output Formats and Integration
Profile data integrates well with business intelligence tools, CRM systems, and spreadsheets:
import csv
import json
from pathlib import Path
from datetime import datetime
def export_profiles(
    profiles: list[dict],
    output_dir: str = "instagram_exports",
    include_posts: bool = False,
) -> dict:
    """Write profiles to timestamped JSON (full data) and CSV (flat summary).

    Returns a dict with the written file paths and the profile count.
    """
    out = Path(output_dir)
    out.mkdir(exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M")

    # JSON keeps the complete nested structure.
    json_file = out / f"profiles_{timestamp}.json"
    json_file.write_text(json.dumps(profiles, indent=2, ensure_ascii=False))

    # CSV carries only the flat summary columns; nested post data is dropped.
    csv_fields = [
        "username", "full_name", "biography", "website",
        "followers", "following", "post_count",
        "is_verified", "is_private", "is_business",
        "business_category", "profile_pic_url",
    ]
    csv_file = out / f"profiles_{timestamp}.csv"
    with open(csv_file, "w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=csv_fields, extrasaction="ignore")
        writer.writeheader()
        writer.writerows(
            {field: profile.get(field) for field in csv_fields}
            for profile in profiles
        )

    print(f"Exported {len(profiles)} profiles:")
    print(f" JSON: {json_file}")
    print(f" CSV: {csv_file}")
    return {"json": str(json_file), "csv": str(csv_file), "count": len(profiles)}
Audience Demographics Inference
While Instagram's API does not expose demographic data for arbitrary profiles, you can infer demographics from the content and engagement patterns:
import re
from collections import Counter
def infer_audience_demographics(posts: list[dict], profile: dict) -> dict:
    """Heuristically infer audience traits from bio + caption keywords.

    Instagram's API exposes no demographics for arbitrary accounts, so
    this counts signal words as a rough proxy. Treat the results as
    hypotheses, not ground truth.
    """
    bio = (profile.get("biography") or "").lower()
    captions = " ".join(p.get("caption", "") for p in posts[:30]).lower()
    combined = bio + " " + captions

    def count_matches(signal_map: dict) -> dict:
        return {label: len(re.findall(pattern, combined, re.IGNORECASE))
                for label, pattern in signal_map.items()}

    # Age indicator keywords
    age_scores = count_matches({
        "gen_z": r"\b(gen z|genz|zoomer|tiktok|discord|stan|slay|vibe|aesthetic|no cap|fr fr)\b",
        "millennial": r"\b(millennial|adulting|throwback|nostalgia|90s|2000s|hustle culture|wine mom)\b",
        "professional": r"\b(ceo|founder|executive|linkedin|b2b|enterprise|corporate|professional)\b",
        "parent": r"\b(mom|dad|parent|toddler|kids|parenting|family|babywearing|breastfeeding)\b",
    })

    # Gender indicator (rough)
    gender_scores = count_matches({
        "female": r"\b(she/her|woman|girl|sis|ladies|feminist|girly|sisterhood|babe)\b",
        "male": r"\b(he/him|man|guy|bro|brotherhood|masculine|gentlemen)\b",
    })

    # Interest inference
    interest_scores = count_matches({
        "sports": r"\b(athlete|fitness|gym|sport|training|marathon|crossfit|swim|run|cycle)\b",
        "lifestyle": r"\b(lifestyle|wellness|mindset|self.care|routine|morning|journal)\b",
        "creative": r"\b(art|design|creative|photography|illustration|craft|sketch|drawing)\b",
        "entrepreneur": r"\b(entrepreneur|startup|founder|business|passive income|side hustle)\b",
    })

    return {
        "likely_age_group": max(age_scores, key=age_scores.get) if any(age_scores.values()) else "unknown",
        "age_signals": age_scores,
        "likely_gender": max(gender_scores, key=gender_scores.get) if any(gender_scores.values()) else "unknown",
        "top_interests": sorted(interest_scores.items(), key=lambda x: x[1], reverse=True)[:3],
    }
Reels Performance vs. Static Posts
A key strategic insight for content creators: comparing Reel performance against static post performance:
import statistics
from collections import defaultdict
def compare_content_type_performance(
    posts: list[dict],
    reels: list[dict],
) -> dict:
    """Compare average engagement across content formats.

    Buckets the feed into photos / carousels / videos, summarizes each
    bucket plus reels, and reports how many times more likes reels get
    than static photos (None when either side lacks like data).
    """
    buckets = {"photo": [], "video": [], "carousel": []}
    for post in posts:
        kind = post.get("media_type")
        if kind in buckets:
            buckets[kind].append(post)

    def summarize(items: list[dict]) -> dict:
        """Mean/median engagement stats for one bucket; {'count': 0} when empty."""
        if not items:
            return {"count": 0}
        likes = [entry.get("like_count", 0) for entry in items]
        comments = [entry.get("comment_count", 0) for entry in items]
        views = [entry.get("view_count", 0) for entry in items if entry.get("view_count")]
        return {
            "count": len(items),
            "avg_likes": round(statistics.mean(likes), 0),
            "median_likes": round(statistics.median(likes), 0),
            "avg_comments": round(statistics.mean(comments), 1),
            "avg_views": round(statistics.mean(views), 0) if views else None,
        }

    reel_stats = summarize(reels)
    photo_stats = summarize(buckets["photo"])
    carousel_stats = summarize(buckets["carousel"])
    video_stats = summarize(buckets["video"])

    # How many times more likes reels draw versus static photos.
    reels_vs_photos = None
    if reel_stats.get("avg_likes") and photo_stats.get("avg_likes") and photo_stats["avg_likes"] > 0:
        reels_vs_photos = round(reel_stats["avg_likes"] / photo_stats["avg_likes"], 2)

    result = {
        "photos": photo_stats,
        "carousels": carousel_stats,
        "videos": video_stats,
        "reels": reel_stats,
        "reels_vs_photos_multiplier": reels_vs_photos,
    }

    print("Content type performance comparison:")
    for content_type, stats in result.items():
        if isinstance(stats, dict) and stats.get("count"):
            print(f" {content_type:<12}: {stats['count']} posts, "
                  f"{stats.get('avg_likes', 0):.0f} avg likes, "
                  f"{stats.get('avg_comments', 0):.1f} avg comments")
    if reels_vs_photos:
        print(f"\n Reels get {reels_vs_photos}x more likes than static photos for this account")
    return result
Monitoring Profile Changes
Detect when a competitor account changes their bio, adds a link, or changes their profile picture — useful for competitive intelligence:
import json
import hashlib
import requests
from pathlib import Path
from datetime import datetime
PROFILE_HISTORY = Path("profile_change_history.json")
def load_profile_history() -> dict:
    """Return the saved per-username snapshot map, or {} when no history file exists yet."""
    if not PROFILE_HISTORY.exists():
        return {}
    return json.loads(PROFILE_HISTORY.read_text())
def save_profile_history(data: dict) -> None:
    """Overwrite the on-disk history file with *data* as pretty-printed JSON."""
    serialized = json.dumps(data, indent=2)
    PROFILE_HISTORY.write_text(serialized)
def profile_fingerprint(profile: dict) -> str:
    """Return a stable MD5 hex digest over the change-relevant profile fields.

    Only biography, website, followers, post_count and is_verified participate,
    so edits to other fields do not register as a change. Missing fields fall
    back to empty/zero/False defaults, keeping the digest deterministic.
    """
    defaults = {
        "biography": "",
        "website": "",
        "followers": 0,
        "post_count": 0,
        "is_verified": False,
    }
    key_fields = {name: profile.get(name, fallback) for name, fallback in defaults.items()}
    # sort_keys makes the JSON canonical so equal inputs always hash equal.
    canonical = json.dumps(key_fields, sort_keys=True).encode()
    return hashlib.md5(canonical).hexdigest()
def detect_profile_changes(
    username: str,
    session_id: str,
    proxy_url: str | None = None,
) -> list[str]:
    """Fetch @username's current profile and diff it against the stored snapshot.

    Returns a list of human-readable change descriptions (empty when nothing
    changed, the user has not been seen before, or the fetch failed).
    Side effects: prints any detected changes and rewrites the on-disk history
    file with the latest snapshot.

    Args:
        username: Instagram handle to check (without the '@').
        session_id: Value of an authenticated Instagram 'sessionid' cookie.
        proxy_url: Optional proxy URL applied to both http and https traffic.
    """
    history = load_profile_history()
    now = datetime.now().isoformat()
    # Fetch current profile via the private web_profile_info endpoint; the
    # mobile-app User-Agent and the X-IG-App-ID header are both sent so the
    # endpoint treats this as an app/web client request.
    headers = {
        "User-Agent": "Instagram 317.0.0.34.109 Android (30/11; 420dpi; 1080x2220; samsung; SM-G991B; o1s; exynos2100)",
        "X-IG-App-ID": "936619743392459",
    }
    cookies = {"sessionid": session_id}
    proxies = {"http": proxy_url, "https": proxy_url} if proxy_url else None
    resp = requests.get(
        "https://i.instagram.com/api/v1/users/web_profile_info/",
        params={"username": username},
        headers=headers,
        cookies=cookies,
        proxies=proxies,
        timeout=15,
    )
    if resp.status_code != 200:
        # Best-effort: a failed fetch reports "no changes" rather than raising,
        # and deliberately skips the history update below.
        return []
    current_profile = extract_profile_data(resp.json())
    current_fp = profile_fingerprint(current_profile)
    changes = []
    if username in history:
        last = history[username]["last_snapshot"]
        last_fp = history[username]["fingerprint"]
        # Cheap fingerprint comparison first; only diff field-by-field on mismatch.
        if current_fp != last_fp:
            # Detect specific changes
            for field in ["biography", "website", "is_verified"]:
                if last.get(field) != current_profile.get(field):
                    changes.append(f"{field} changed: '{last.get(field)}' -> '{current_profile.get(field)}'")
            # Follower deltas of 1000 or less are treated as noise, not a change.
            follower_diff = current_profile.get("followers", 0) - last.get("followers", 0)
            if abs(follower_diff) > 1000:
                changes.append(f"Follower count changed by {follower_diff:+,}")
    if changes:
        print(f"Changes detected for @{username}:")
        for change in changes:
            print(f" - {change}")
    # Always persist the latest snapshot — including on a first sighting or a
    # no-change run — so the next invocation diffs against fresh data.
    history[username] = {
        "fingerprint": current_fp,
        "last_snapshot": current_profile,
        "last_checked": now,
    }
    save_profile_history(history)
    return changes
Performance Optimization for Bulk Collection
When collecting hundreds of profiles, optimize throughput while respecting rate limits:
import asyncio
import json
import time
import random
import httpx
from pathlib import Path
from datetime import datetime
async def collect_profiles_async(
    usernames: list[str],
    sessions: list[str],
    proxy_url: str | None = None,
    concurrency: int = 3,
    delay_range: tuple[float, float] = (2.0, 5.0),
) -> list[dict]:
    """Fetch Instagram profiles concurrently with bounded parallelism.

    Sessions are rotated round-robin across requests so no single account
    absorbs all the traffic; a semaphore caps in-flight requests and a random
    post-request delay spaces them out to reduce rate-limit hits.

    Args:
        usernames: Handles to fetch (without '@').
        sessions: Pool of authenticated 'sessionid' cookie values to rotate.
        proxy_url: Optional proxy URL for all requests.
        concurrency: Max simultaneous requests.
        delay_range: (min, max) seconds to sleep after each request.

    Returns:
        Parsed profile dicts for every successful fetch; failures
        (non-200 responses, rate limits, exceptions) are silently dropped.
    """
    sem = asyncio.Semaphore(concurrency)
    session_idx = 0
    # NOTE: the unused `results` accumulator from the original was removed;
    # results are collected via asyncio.gather below.

    async def fetch_one(username: str) -> dict | None:
        nonlocal session_idx
        # Round-robin session rotation; safe without a lock because there is
        # no await between the read and the increment.
        session_id = sessions[session_idx % len(sessions)]
        session_idx += 1
        headers = {
            "User-Agent": "Instagram 317.0.0.34.109 Android (30/11; 420dpi; 1080x2220; samsung; SM-G991B; o1s; exynos2100)",
            "X-IG-App-ID": "936619743392459",
        }
        cookies = {"sessionid": session_id}
        async with sem:
            try:
                transport = httpx.AsyncHTTPTransport(proxy=proxy_url) if proxy_url else None
                async with httpx.AsyncClient(transport=transport) as client:
                    resp = await client.get(
                        "https://i.instagram.com/api/v1/users/web_profile_info/",
                        params={"username": username},
                        headers=headers,
                        cookies=cookies,
                        timeout=15,
                    )
                    if resp.status_code == 429:
                        print(f" Rate limited for @{username}")
                        return None
                    if resp.status_code != 200:
                        return None
                    return extract_profile_data(resp.json())
            except Exception as e:
                # Best-effort bulk collection: log and skip the failed profile.
                print(f" Error for @{username}: {e}")
                return None
            finally:
                # Delay runs while still holding the semaphore, throttling
                # overall request rate, not just concurrency.
                await asyncio.sleep(random.uniform(*delay_range))

    tasks = [fetch_one(u) for u in usernames]
    raw_results = await asyncio.gather(*tasks)
    return [r for r in raw_results if r]
# For running async from sync context
def collect_profiles_parallel(
    usernames: list[str],
    sessions: list[str],
    proxy_url: str | None = None,
) -> list[dict]:
    """Synchronous wrapper around collect_profiles_async (concurrency fixed at 3).

    Note: asyncio.run() raises RuntimeError if called from inside a running
    event loop — in that case await collect_profiles_async directly instead.
    """
    return asyncio.run(
        collect_profiles_async(usernames, sessions, proxy_url=proxy_url, concurrency=3)
    )