Skip to content

Commit 771b99f

Browse files
committed
RCL model added (context generated based on relations only)
1 parent 0fb586d commit 771b99f

1 file changed

Lines changed: 126 additions & 2 deletions

File tree

retrieval_augmented_link_predictor.py

Lines changed: 126 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -647,6 +647,124 @@ def forward_triples(self, indexed_triples: torch.LongTensor):
647647
return torch.FloatTensor(scores)
648648

649649

650+
class RCL(AbstractBaseLinkPredictorClass):
    """ Relation-based Context Learning to predict missing entities.

    (h, r, t) ∈ G_test

    1. Use all triples from G_train involving relation r to create context.
    2. Generate a prompt based on these triples and (h,r) to assign scores for all e ∈ E.
    """
    def __init__(self, knowledge_graph: KG = None, base_url:str=None, api_key:str=None, llm_model:str=None,
                 temperature:float=0.0, seed:int=42, max_relation_examples:int=50, use_val:bool=True,
                 exclude_source:bool=True) -> None:
        """Build the relation→triples context index and the LLM client.

        Args:
            knowledge_graph: dataset wrapper; train/valid index tensors are read from it.
            base_url: OpenAI-compatible endpoint (required).
            api_key: API key passed to the OpenAI client.
            llm_model: model name used for chat completions.
            temperature: sampling temperature (0.0 = deterministic).
            seed: sampling seed forwarded to the API.
            max_relation_examples: cap on context triples per relation.
            use_val: if True, validation triples are also used as context.
            exclude_source: if True, context triples sharing the query's head entity
                are filtered out at prompt time (avoids leaking the answer).
        """
        super().__init__(knowledge_graph, name="RCL")
        assert base_url is not None and isinstance(base_url, str)
        self.base_url = base_url
        self.api_key = api_key
        self.llm_model = llm_model
        self.temperature = temperature
        self.seed = seed
        self.max_relation_examples = max_relation_examples
        self.exclude_source = exclude_source
        self.client = OpenAI(base_url=self.base_url, api_key=self.api_key)

        # NOTE(review): idx_to_entity / idx_to_relation / entity_to_idx / self.kg are
        # presumably provided by AbstractBaseLinkPredictorClass — confirm in the base class.
        # Training dataset (string triples decoded from index triples).
        self.train_set: List[Tuple[str]] = self._decode_triples(self.kg.train_set.tolist())
        # Validation dataset.
        self.val_set: List[Tuple[str]] = self._decode_triples(self.kg.valid_set.tolist())

        triples = self.train_set + self.val_set if use_val else self.train_set

        # Create a mapping from relation to all triples using that relation.
        self.relation_to_triples = {}
        for s, p, o in triples:
            self.relation_to_triples.setdefault(p, []).append((s, p, o))

        # sorted() already returns a list; deterministic candidate order for the prompt.
        self.target_entities = sorted(self.entity_to_idx.keys())

    def _decode_triples(self, index_triples) -> List[Tuple[str]]:
        """Map (head_idx, relation_idx, tail_idx) triples to their string forms."""
        return [(self.idx_to_entity[idx_h],
                 self.idx_to_relation[idx_r],
                 self.idx_to_entity[idx_t]) for idx_h, idx_r, idx_t in index_triples]

    def _create_prompt_based_on_relation(self, source: str, relation: str) -> str:
        """Build the LLM prompt for query (source, relation, ?) from relation context.

        Context = up to max_relation_examples training/validation triples that use
        `relation`, optionally excluding those whose head equals `source`.
        """
        # Get all triples with the current relation (empty list for unseen relations).
        relation_triples = self.relation_to_triples.get(relation, [])

        # Exclude triples where the source entity is the current one if flag is set.
        if self.exclude_source:
            relation_triples = [triple for triple in relation_triples if triple[0] != source]

        # Limit examples if too many.
        # NOTE(review): truncation happens BEFORE sorting, so which examples survive
        # depends on triple insertion order — confirm this is intended.
        if len(relation_triples) > self.max_relation_examples:
            relation_triples = relation_triples[:self.max_relation_examples]

        relation_context = ("Here are examples of how the relation is used in the knowledge base:\n"
                            + "".join(f"- {s} {p} {o}\n" for s, p, o in sorted(relation_triples))
                            + "\n")

        base_prompt = f"""
I'm trying to predict the most likely target entities for the following query:
Source entity: {source}
Relation: {relation}
Query: ({source}, {relation}, ?)

{relation_context}

Please provide a ranked list of at most {min(len(self.target_entities),15)} likely target entities from the following list, along with likelihoods for each: {self.target_entities}

Provide your answer in the following JSON format: {{"predictions": [{{"entity": "entity_name", "score": float_number}}]}}

Notes:
1. Use the provided knowledge about how the relation is used to inform your predictions.
2. Only include entities that are plausible targets for this relation.
3. For geographic entities, consider geographic location, regional classifications, and political associations.
4. Rank the entities by likelihood of being the correct target.
5. ONLY INCLUDE entities from the provided list in your predictions.
6. If certain entities are not suitable for this relation, don't include them.
7. Return a valid JSON output.
8. Make sure scores are floating point numbers between 0 and 1, not strings.
9. A score can only be between 0 and 1, i.e. score ∈ [0, 1]. They can never be negative or greater than 1!
"""
        return base_prompt

    def forward_triples(self, x: torch.LongTensor):
        """Triple scoring is not supported by RCL; only KvsAll prediction is."""
        raise NotImplementedError("RCL needs to implement it")

    def forward_k_vs_all(self, x: torch.LongTensor) -> torch.FloatTensor:
        """Score every entity as tail for each (head, relation) pair in the batch.

        Args:
            x: LongTensor of shape (batch, 2) holding (entity_idx, relation_idx) pairs
               — assumed from the indexing below; TODO confirm against the caller.

        Returns:
            FloatTensor of shape (batch, |E|); entities the LLM did not rank keep
            the sentinel score -1.0.
        """
        batch_output = []
        # Iterate over batch of subject and relation pairs.
        for idx_h, idx_r in x.tolist():
            # String representations of an entity and a relation, respectively.
            h, r = self.idx_to_entity[idx_h], self.idx_to_relation[idx_r]
            llm_response = self.client.chat.completions.create(
                model=self.llm_model, temperature=self.temperature, seed=self.seed,
                messages=[{"role": "user",
                           "content": "You are a knowledgeable assistant that helps with link prediction tasks.\n" +
                                      self._create_prompt_based_on_relation(source=h, relation=r)}],
                # Constrain decoding to the PredictionResponse JSON schema (vLLM guided decoding).
                extra_body={"guided_json": PredictionResponse.model_json_schema(),
                            "truncate_prompt_tokens": 30_000,
                            }).choices[0].message.content

            prediction_response = PredictionResponse(**json.loads(llm_response))
            # Initialize scores for all entities with the -1.0 "not predicted" sentinel.
            scores_for_all_entities = [-1.0] * len(self.idx_to_entity)
            for pred in prediction_response.predictions:
                try:
                    scores_for_all_entities[self.entity_to_idx[pred.entity]] = pred.score
                except KeyError:
                    # LLM hallucinated an entity outside the vocabulary; report and skip it.
                    print(f"For {h},{r}, {pred} not found\tPrediction Size: {len(prediction_response.predictions)}")
                    continue
            batch_output.append(scores_for_all_entities)
        return torch.FloatTensor(batch_output)
767+
650768
def sanity_checking(args,kg):
651769
if args.eval_size is not None:
652770
assert len(kg.test_set) >= args.eval_size, (f"Evaluation size cant be greater than the "
@@ -664,6 +782,10 @@ def get_model(args,kg)->AbstractBaseLinkPredictorClass:
664782
elif args.model == "GCL":
665783
model = GCL(knowledge_graph=kg, base_url=args.base_url, api_key=args.api_key,
666784
llm_model=args.llm_model_name, temperature=args.temperature, seed=args.seed,num_of_hops=args.num_of_hops)
785+
elif args.model == "RCL":
786+
model = RCL(knowledge_graph=kg, base_url=args.base_url, api_key=args.api_key,
787+
llm_model=args.llm_model_name, temperature=args.temperature, seed=args.seed,
788+
max_relation_examples=args.max_relation_examples, exclude_source=args.exclude_source)
667789
else:
668790
raise KeyError(f"{args.model} is not a valid model")
669791
assert model is not None, f"Couldn't assign a model named: {args.model}"
@@ -689,7 +811,7 @@ def run(args):
689811
if __name__ == "__main__":
690812
parser = argparse.ArgumentParser()
691813
parser.add_argument("--dataset_dir", type=str, default="KGs/Countries-S1", help="Path to dataset.")
692-
parser.add_argument("--model", type=str, default="GCL", help="Model name to use for link prediction.", choices=["RALP",'GCL'])
814+
parser.add_argument("--model", type=str, default="GCL", help="Model name to use for link prediction.", choices=["RALP", "GCL", "RCL"])
693815
parser.add_argument("--base_url", type=str, default="http://harebell.cs.upb.de:8501/v1",
694816
choices=["http://harebell.cs.upb.de:8501/v1", "http://tentris-ml.cs.upb.de:8502/v1"],
695817
help="Base URL for the OpenAI client.")
@@ -706,5 +828,7 @@ def run(args):
706828
parser.add_argument("--batch_size", type=int, default=1)
707829
parser.add_argument("--chunk_size", type=int, default=1)
708830
parser.add_argument("--seed", type=int, default=42)
709-
parser.add_argument("--num_of_hops", type=int, default=1, help="Number of hops to use to extract a subgraph around an entity-")
831+
parser.add_argument("--num_of_hops", type=int, default=1, help="Number of hops to use to extract a subgraph around an entity.")
832+
parser.add_argument("--max_relation_examples", type=int, default=50, help="Maximum number of relation examples to include in RCL context.")
833+
parser.add_argument("--exclude_source", default=True, help="Exclude triples with the same source entity in RCL context.")
710834
run(parser.parse_args())

0 commit comments

Comments
 (0)