# graphstrike/models.py — from the SaiNivedh26/graphstrike repository (main branch)
from __future__ import annotations

from enum import Enum
from typing import Dict, List, Optional

from pydantic import BaseModel

# ---------------------------------------------------------------------------
# OpenEnv base types
# Use real SDK when available; fall back to stubs for local dev without SDK.
# ---------------------------------------------------------------------------
try:
    from openenv.core.env_server import Action, Observation, State  # type: ignore
except ImportError:
    # The SDK is not installed: define same-named pydantic stand-ins so the
    # domain models below can still subclass Action / Observation / State.
    # NOTE(review): the stub fields presumably mirror what the SDK base classes
    # expose — confirm against the openenv version in use.
    class Action(BaseModel):  # type: ignore[no-redef]
        """Fallback base class for actions; declares no fields of its own."""
        pass

    class Observation(BaseModel):  # type: ignore[no-redef]
        """Fallback base class for observations."""
        done: bool = False                # False unless explicitly set
        reward: Optional[float] = None    # None unless explicitly set

    class State(BaseModel):  # type: ignore[no-redef]
        """Fallback base class for environment state."""
        episode_id: str = ""    # empty string unless explicitly set
        step_count: int = 0     # zero unless explicitly set

# ---------------------------------------------------------------------------
# Domain types
# ---------------------------------------------------------------------------

class ActionType(str, Enum):
    """Closed set of actions an agent may request.

    Members mix in ``str``, so each one compares equal to its wire value
    (``ActionType.FLAG == "flag"``) and can be looked up from it
    (``ActionType("flag")``). The step costs noted per member describe the
    environment's budget rules; nothing in this enum enforces them.
    """

    # --- Core actions ---

    # Reveal full profile + edges; costs 1 step.
    INSPECT = "inspect"

    # Expand graph 1 hop; costs 2 steps.
    INVESTIGATE_NETWORK = "investigate_network"

    # Mark account fake (free).
    FLAG = "flag"

    # Unmark account (free).
    UNFLAG = "unflag"

    # End episode, trigger scoring.
    SUBMIT = "submit"

    # --- Round 2: new tool-call actions ---

    # Reveal photo_reuse_score; costs 1 step.
    REVERSE_IMAGE_SEARCH = "reverse_image_search"

    # Reveal bio_template_score; costs 1 step.
    ANALYZE_BIO = "analyze_bio"

    # Reveal ip_cluster_signal; costs 2 steps.
    CHECK_IP = "check_ip"

    # Get platform policy; costs 0 steps.
    GET_POLICY = "get_policy"


class AccountStatus(str, Enum):
    """Review status attached to an account.

    Members mix in ``str``, so each one compares equal to its wire value.
    """

    # Default status.
    NORMAL = "normal"

    # Auto-elevated when a neighbor is flagged.
    SUSPECT = "suspect"

    # Agent explicitly flagged this account.
    CONFIRMED_FAKE = "confirmed_fake"


class FakeGangAction(Action):
    """One agent request: the action to perform and, optionally, its target.

    ``action_type`` has no default and must be supplied. ``account_id``
    defaults to ``None``; this class declares no validator, so the
    "required" rule in the field comment is a convention that is not
    enforced here.
    """
    action_type: ActionType
    # NOTE(review): GET_POLICY is described as fetching the platform policy,
    # so it presumably needs no account_id either — confirm with the
    # environment code that consumes this action.
    account_id: Optional[str] = None  # required for all actions except SUBMIT


class AccountProfile(BaseModel):
    """Feature record for one account, as carried in
    ``FakeGangObservation.visible_accounts``.

    The eight fields from ``account_id`` through ``account_age_days`` have no
    default and must be supplied; every later field is optional. Value ranges
    quoted in the comments (0–1, 0–23, ...) are documentation only: the fields
    are plain ``int``/``float`` and no validators are declared on this model.
    """
    # ── Required raw profile features ──
    account_id: str
    follower_count: int
    following_count: int
    post_count: int
    avg_post_hour: float        # 0–23
    photo_reuse_score: float    # 0–1 — pre-computed: fraction of posts using stolen celebrity photos
    bio_template_score: float   # 0–1 — pre-computed: cosine similarity to known fake bio templates
    account_age_days: int
    name_change_count: int = 0  # incremented by hard-mode evasion events

    # ── Derived graph features (computed at INSPECT time from live graph state) ──
    flagged_neighbor_count: int = 0    # how many of this account's follows are currently flagged
                                       # high value = deep inside a cluster you're already tracking
    mutual_follow_rate: float = 0.0    # fraction of follows that also follow back (0–1)
                                       # real fans: low; fake gangs: high (they mutually inflate each other)
    avg_neighbor_photo_reuse: float = 0.0  # mean photo_reuse_score of inspected follows
                                           # gang members cluster: if neighbors are fake, this is high

    visible_follows: List[str] = []    # IDs of accounts this account follows (revealed by INSPECT)

    # ── Account status ──
    status: AccountStatus = AccountStatus.NORMAL

    # ── Full risk breakdown (computed via scoring.py at INSPECT time) ──
    # All five scores default to 0.0 until populated.
    fake_risk_score: float = 0.0
    node_risk: float = 0.0
    behavior_risk: float = 0.0
    graph_risk: float = 0.0
    hub_legitimacy_score: float = 0.0

    # ── New raw features (from generator) ──
    comment_repeat_score: float = 0.0   # fakes: 0.6-0.9 | decoys: 0.1-0.3 | reals: 0.0-0.08
    shared_ip_count: int = 0            # fakes: 9 (gang shares 1 IP) | reals: 0-1

    # ── Extended runtime graph features ──
    # NOTE(review): no field named flagged_neighbor_ratio exists on this model;
    # presumably it is derived elsewhere as
    # flagged_neighbor_count / inspected_neighbor_count — confirm.
    inspected_neighbor_count: int = 0   # denominator for flagged_neighbor_ratio
    post_hour_cluster_score: float = 0.0  # hour alignment to flagged cluster mean
    suspicious_mutual_ratio: float = 0.0  # used in hub legitimacy computation


class FakeGangObservation(Observation):
    """Observation payload for the fake-gang task, extending ``Observation``.

    Every field declared here has a default, so no field of this subclass is
    mandatory at construction time.

    NOTE(review): the ``[]`` / ``{}`` defaults are safe when the base is a
    pydantic model (pydantic copies field defaults per instance) — confirm the
    same holds for the SDK's ``Observation`` base.
    """
    visible_accounts: List[AccountProfile] = []
    visible_account_ids: List[str] = []   # all account IDs the agent knows exist
    flagged_ids: List[str] = []
    inspected_ids: List[str] = []
    graph_edges: Dict[str, List[str]] = {}  # account_id -> list of accounts it follows
    steps_remaining: int = 0
    evasion_triggered: bool = False
    evasion_count: int = 0
    task: str = "easy"   # same default as FakeGangState.task
    message: str = ""
    suspect_ids: List[str] = []  # auto-elevated neighbors of flagged accounts
    platform: str = ""  # Round 2: Platform name (Instagram/Snapchat) - passed from state


class FakeGangState(State):
    """Episode-level state for the fake-gang task, extending ``State``.

    Every field declared here has a default. ``task``, ``evasion_count`` and
    ``platform`` also appear on ``FakeGangObservation`` under the same names.
    """
    task: str = "easy"
    score_so_far: float = 0.0
    evasion_count: int = 0
    network_size: int = 0
    gang_size: int = 10     # the only numeric field here with a non-zero default
    episode_seed: int = 0
    platform: str = ""  # Round 2: Platform name (Instagram/Snapchat)


# ---------------------------------------------------------------------------
# Round 2: Platform Policy Model
# ---------------------------------------------------------------------------

class PlatformPolicy(BaseModel):
    """Dynamically compiled platform policy from transparency reports.

    The eight fields from ``platform`` through ``fp_penalty_weight`` have no
    default and must be supplied; ``sources``, ``confidence``, ``compiled_at``
    and ``used_fallback`` are optional provenance metadata. The allowed values
    and ranges listed in the comments are documentation only: the fields are
    plain ``str``/``float`` and no validators are declared on this model.
    """
    platform: str                    # "Instagram" or "Snapchat"
    threshold: float                 # θ* - computed Bayesian threshold for flagging
    base_rate: float                 # π - prevalence of fake accounts
    fn_cost_signal: str              # "low" | "medium" | "high" | "critical"
    fp_cost_signal: str              # "low" | "medium" | "high"
    harm_weight: float               # enforcement vs creator balance (0.5-2.0)
    primary_enforcement_signal: str  # "photo_reuse" | "bio_template" | "ip_cluster"
    fp_penalty_weight: float         # C_fp for reward function
    sources: List[str] = []          # URLs used for extraction
    confidence: float = 0.0          # LLM extraction confidence (0.0-1.0)
    compiled_at: str = ""            # ISO timestamp
    used_fallback: bool = False      # True if fallback policy was used due to extraction failure