# models.py
# Pydantic data models for the environment: agent actions, account profiles,
# observations, episode state, and the compiled platform policy.
from __future__ import annotations
from enum import Enum
from typing import Dict, List, Optional
from pydantic import BaseModel
# ---------------------------------------------------------------------------
# OpenEnv base types
# Use real SDK when available; fall back to stubs for local dev without SDK.
# ---------------------------------------------------------------------------
try:
    # Prefer the real OpenEnv SDK base classes whenever the package imports.
    from openenv.core.env_server import Action, Observation, State  # type: ignore
except ImportError:
    # SDK not installed: define minimal pydantic stand-ins under the same
    # names so the models in this module can still subclass them.

    class Action(BaseModel):  # type: ignore[no-redef]
        # Stub action base; declares no fields of its own.
        pass

    class Observation(BaseModel):  # type: ignore[no-redef]
        # Stub observation base: a done flag plus an optional reward.
        done: bool = False
        reward: Optional[float] = None

    class State(BaseModel):  # type: ignore[no-redef]
        # Stub state base: episode identifier and a step counter.
        episode_id: str = ""
        step_count: int = 0
# ---------------------------------------------------------------------------
# Domain types
# ---------------------------------------------------------------------------
class ActionType(str, Enum):
    # Closed set of moves the agent can request. Members also subclass ``str``,
    # so each one compares equal to its string value. Step costs quoted below
    # are the ones noted alongside each member; they are not enforced here.

    # Reveal full profile + edges -- costs 1 step.
    INSPECT = "inspect"
    # Expand the graph by 1 hop -- costs 2 steps.
    INVESTIGATE_NETWORK = "investigate_network"
    # Mark an account as fake -- free.
    FLAG = "flag"
    # Remove the fake mark from an account -- free.
    UNFLAG = "unflag"
    # End the episode and trigger scoring.
    SUBMIT = "submit"

    # --- Round 2: tool-call actions ---
    # Reveal photo_reuse_score -- costs 1 step.
    REVERSE_IMAGE_SEARCH = "reverse_image_search"
    # Reveal bio_template_score -- costs 1 step.
    ANALYZE_BIO = "analyze_bio"
    # Reveal ip_cluster_signal -- costs 2 steps.
    CHECK_IP = "check_ip"
    # Fetch the platform policy -- costs 0 steps.
    GET_POLICY = "get_policy"
class AccountStatus(str, Enum):
    # Review status attached to an account. Members also subclass ``str``.

    # Default: nothing recorded against the account.
    NORMAL = "normal"
    # Auto-elevated when a neighbor is flagged.
    SUSPECT = "suspect"
    # The agent explicitly flagged this account.
    CONFIRMED_FAKE = "confirmed_fake"
class FakeGangAction(Action):
    # A single agent move: the kind of action plus the account it targets.

    # Which move is being requested.
    action_type: ActionType

    # Target account; defaults to None. Noted as required for all actions
    # except SUBMIT.
    # NOTE(review): this model does not enforce that requirement -- presumably
    # the environment validates it; also confirm whether GET_POLICY needs a
    # target account.
    account_id: Optional[str] = None
class AccountProfile(BaseModel):
    # Everything the agent can see about one account: raw profile counters,
    # graph-derived features, review status, and the risk breakdown.

    # -- Identity and raw profile counters (all required) --
    account_id: str
    follower_count: int
    following_count: int
    post_count: int
    # Hour of day, 0-23.
    avg_post_hour: float
    # 0-1, pre-computed: fraction of posts using stolen celebrity photos.
    photo_reuse_score: float
    # 0-1, pre-computed: cosine similarity to known fake bio templates.
    bio_template_score: float
    account_age_days: int
    # Incremented by hard-mode evasion events.
    name_change_count: int = 0

    # -- Derived graph features (computed at INSPECT time from live graph state) --
    # How many of this account's follows are currently flagged. A high value
    # means the account sits deep inside a cluster already being tracked.
    flagged_neighbor_count: int = 0
    # Fraction (0-1) of follows that also follow back. Real fans: low;
    # fake gangs: high, because members mutually inflate each other.
    mutual_follow_rate: float = 0.0
    # Mean photo_reuse_score of inspected follows. Gang members cluster, so
    # this is high when the neighbors are fake.
    avg_neighbor_photo_reuse: float = 0.0
    # IDs of accounts this account follows (revealed by INSPECT).
    visible_follows: List[str] = []

    # -- Account status --
    status: AccountStatus = AccountStatus.NORMAL

    # -- Full risk breakdown (computed via scoring.py at INSPECT time) --
    fake_risk_score: float = 0.0
    node_risk: float = 0.0
    behavior_risk: float = 0.0
    graph_risk: float = 0.0
    hub_legitimacy_score: float = 0.0

    # -- New raw features (from generator) --
    # Fakes: 0.6-0.9 | decoys: 0.1-0.3 | reals: 0.0-0.08.
    comment_repeat_score: float = 0.0
    # Fakes: 9 (gang shares 1 IP) | reals: 0-1.
    shared_ip_count: int = 0

    # -- Extended runtime graph features --
    # Denominator for flagged_neighbor_ratio.
    inspected_neighbor_count: int = 0
    # Hour alignment to the flagged cluster mean.
    post_hour_cluster_score: float = 0.0
    # Used in the hub legitimacy computation.
    suspicious_mutual_ratio: float = 0.0
class FakeGangObservation(Observation):
    # Snapshot handed back to the agent. Every field has a default, so an
    # empty observation can be constructed with no arguments.

    # Profiles currently visible to the agent.
    visible_accounts: List[AccountProfile] = []
    # All account IDs the agent knows exist.
    visible_account_ids: List[str] = []
    flagged_ids: List[str] = []
    inspected_ids: List[str] = []
    # account_id -> list of accounts it follows.
    graph_edges: Dict[str, List[str]] = {}

    # -- Episode progress --
    steps_remaining: int = 0
    evasion_triggered: bool = False
    evasion_count: int = 0
    task: str = "easy"
    message: str = ""

    # Auto-elevated neighbors of flagged accounts.
    suspect_ids: List[str] = []
    # Round 2: platform name (Instagram/Snapchat), passed from state.
    platform: str = ""
class FakeGangState(State):
    # Episode-level bookkeeping layered on top of the base State fields.

    task: str = "easy"
    score_so_far: float = 0.0
    evasion_count: int = 0

    # -- Network parameters --
    network_size: int = 0
    gang_size: int = 10
    episode_seed: int = 0

    # Round 2: platform name (Instagram/Snapchat).
    platform: str = ""
# ---------------------------------------------------------------------------
# Round 2: Platform Policy Model
# ---------------------------------------------------------------------------
class PlatformPolicy(BaseModel):
    """Dynamically compiled platform policy from transparency reports."""

    # -- Required policy parameters --
    # "Instagram" or "Snapchat".
    platform: str
    # theta* (θ*): computed Bayesian threshold for flagging.
    threshold: float
    # pi (π): prevalence of fake accounts.
    base_rate: float
    # One of "low" | "medium" | "high" | "critical" (plain str, not validated here).
    fn_cost_signal: str
    # One of "low" | "medium" | "high" (plain str, not validated here).
    fp_cost_signal: str
    # Enforcement vs creator balance (0.5-2.0).
    harm_weight: float
    # One of "photo_reuse" | "bio_template" | "ip_cluster".
    primary_enforcement_signal: str
    # C_fp for the reward function.
    fp_penalty_weight: float

    # -- Provenance (all optional) --
    # URLs used for extraction.
    sources: List[str] = []
    # LLM extraction confidence (0.0-1.0).
    confidence: float = 0.0
    # ISO timestamp.
    compiled_at: str = ""
    # True if the fallback policy was used due to extraction failure.
    used_fallback: bool = False