Skip to content

Commit 1ba5cd8

Browse files
committed
Refactored
1 parent 2bcaa0c commit 1ba5cd8

3 files changed

Lines changed: 108 additions & 7 deletions

File tree

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
from .abstract import AbstractBaseLinkPredictorClass
2-
from .models import RCL,RALP, GCL
2+
from .models import RCL,RALP, GCL, Demir

retrieval_aug_predictors/models.py

Lines changed: 101 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
import json
88
import re
99
import igraph
10+
from typing import Tuple, Dict
11+
import dspy
1012
class PredictionItem(BaseModel):
1113
"""Individual prediction item with entity name and confidence score."""
1214
entity: str = Field(..., description="Name of the predicted entity")
@@ -190,12 +192,9 @@ def __init__(self, knowledge_graph: KG = None,
190192
llm_model="tentris",
191193
temperature: float = 1, seed: int = 42) -> None:
192194
super().__init__(knowledge_graph, name)
193-
# @TODO: CD: input arguments should be passed onto the abstract class
194-
195195
self.client = OpenAI(base_url=base_url, api_key=api_key)
196196
self.llm_model = llm_model
197197
self.temperature = temperature
198-
# @TODO:CD: Use the seed
199198
self.seed = seed
200199

201200
def extract_float(self, text):
@@ -401,4 +400,102 @@ def forward_k_vs_all(self, x: torch.LongTensor) -> torch.FloatTensor:
401400
print(f"For {h},{r}, {pred} not found\tPrediction Size: {len(prediction_response.predictions)}")
402401
continue
403402
batch_output.append(scores_for_all_entities)
404-
return torch.FloatTensor(batch_output)
403+
return torch.FloatTensor(batch_output)
404+
405+
# 1. Define the Signature
class KGLikelihood(dspy.Signature):
    # NOTE(review): for dspy.Signature subclasses the docstring below is part of
    # the prompt sent to the LM — it is behavior, not documentation. Do not
    # reword it casually.
    """Assess the likelihood that a triple (subject, predicate, candidate_object) is true,
    given some context triples. Output a score between 0.0 and 1.0."""

    # Input fields: each `desc` is also surfaced to the LM by dspy.
    context = dspy.InputField(desc="Known knowledge graph triples.")
    subject = dspy.InputField(desc="The subject entity.")
    predicate = dspy.InputField(desc="The relationship type.")
    candidate_object = dspy.InputField(desc="The candidate object entity to score.")

    # Output field: the LM is asked for a single likelihood value.
    score = dspy.OutputField(desc="A likelihood score between 0.0 and 1.0.")
416+
417+
418+
class MultiLabelLinkPredictionWithScores(dspy.Signature):
    # NOTE(review): this docstring doubles as the LM prompt instructions
    # (dspy.Signature semantics) — changing its wording changes model output.
    """Given a subject entity and a predicate (relation), predict a list of
    object entities that satisfy the relation, along with a likelihood score for each.
    Use the provided examples as a guide.
    Output a JSON formatted list of objects, where each object has an 'entity' (string)
    and a 'score' (float between 0.0 and 1.0) key."""

    # Few-shot demonstrations, pre-rendered to plain text by the caller
    # (see MultiLabelLinkPredictor.forward).
    examples = dspy.InputField(
        desc="Few-shot examples of (subject, predicate) -> [{'entity': entity1, 'score': score1}, ...].")
    subject = dspy.InputField(desc="The subject entity.")
    predicate = dspy.InputField(desc="The relationship type.")

    # Updated OutputField requesting JSON; the caller parses it with json.loads.
    objects_with_scores = dspy.OutputField(
        desc="A JSON string representing a list of objects. "
             "Each object in the list should be a dictionary with 'entity' (string) and 'score' (float, 0.0-1.0) keys.")
434+
435+
class MultiLabelLinkPredictor(dspy.Module):
    """dspy module that, for a (subject, predicate) pair, predicts candidate
    object entities together with a likelihood score for each."""

    def __init__(self):
        super().__init__()
        self.predictor = dspy.Predict(MultiLabelLinkPredictionWithScores)

    def forward(self, subject, predicate, few_shot_examples) -> List[Tuple[str, float]]:
        """Return [(entity, score), ...] predicted for (subject, predicate).

        few_shot_examples maps (subject, predicate) tuples to lists of object
        entities; they are rendered into a plain-text few-shot section of the
        prompt. Raises ValueError if the LM emits malformed JSON.
        """
        # Build the few-shot prompt text with a single join instead of the
        # original quadratic `+=` concatenation; rendered text is identical.
        example_str = "".join(
            f"({s}, {p})\n{', '.join(o_list)}\n---\n"
            for (s, p), o_list in few_shot_examples.items()
        )
        # @TODO: CD: Also keep track of LLM cost
        dspy_pred: dspy.primitives.prediction.Prediction = self.predictor(
            examples=example_str, subject=subject, predicate=predicate)
        # The signature asks the LM for a JSON list of {'entity': ..., 'score': ...}.
        return [(i["entity"], i["score"]) for i in json.loads(dspy_pred.objects_with_scores)]
446+
447+
class Demir(AbstractBaseLinkPredictorClass):
    """Link predictor that asks an LLM (via dspy) for scored object entities.

    The training (and optionally validation) triples are indexed as a
    (subject, predicate) -> [objects] few-shot pool; for each query the LM
    returns scored candidates which are mapped back onto entity indices.
    """

    def __init__(self, knowledge_graph, base_url, api_key, temperature, seed, llm_model,
                 use_val: bool = False):
        """Set up the dspy LM client and index the triples for few-shot prompting.

        use_val: if True, validation triples are added to the few-shot pool.
        """
        super().__init__(knowledge_graph, name="Demir")
        self.client = OpenAI(base_url=base_url, api_key=api_key)
        self.temperature = temperature
        self.seed = seed

        # cache=True / cache_in_memory=True avoid re-querying the LM for
        # repeated prompts; prompts are truncated to 32k tokens server-side.
        self.lm = dspy.LM(model=f"openai/{llm_model}", api_key=api_key,
                          api_base=base_url,
                          seed=seed,
                          temperature=temperature,
                          cache=True, cache_in_memory=True,
                          kwargs={"extra_body": {"truncate_prompt_tokens": 32_000}})
        dspy.configure(lm=self.lm)

        # Materialize index triples back into (head, relation, tail) strings.
        self.train_set: List[Tuple[str]] = [(self.idx_to_entity[idx_h],
                                             self.idx_to_relation[idx_r],
                                             self.idx_to_entity[idx_t]) for idx_h, idx_r, idx_t in
                                            self.kg.train_set.tolist()]
        # Validation dataset
        self.val_set: List[Tuple[str]] = [(self.idx_to_entity[idx_h],
                                           self.idx_to_relation[idx_r],
                                           self.idx_to_entity[idx_t]) for idx_h, idx_r, idx_t in
                                          self.kg.valid_set.tolist()]
        self.triples = self.train_set + self.val_set if use_val else self.train_set

        # (subject, predicate) -> [object, ...], the few-shot example pool.
        # (Removed an unused `from collections import OrderedDict`.)
        self.entity_relation_to_entities = dict()
        for s, p, o in self.triples:
            self.entity_relation_to_entities.setdefault((s, p), []).append(o)

        # Instantiate the dspy-backed scoring predictor.
        self.scoring_func = MultiLabelLinkPredictor()
        # sorted() already returns a list; the original wrapped it in list().
        self.entities: List[str] = sorted(self.entity_to_idx.keys())

    def forward_triples(self, x: torch.LongTensor) -> torch.FloatTensor:
        # Fixed copy-paste defect: the original message wrongly referred to RCL.
        raise NotImplementedError("Demir needs to implement it")

    def forward_k_vs_all(self, x: torch.LongTensor) -> torch.FloatTensor:
        """Score every known entity as tail for each (head, relation) row in x.

        Returns a (batch, num_entities) float tensor; entities the LM did not
        mention keep the sentinel score -100.
        """
        batch_predictions = []
        for idx_h, idx_r in x.tolist():
            h, r = self.idx_to_entity[idx_h], self.idx_to_relation[idx_r]
            predictions = self.scoring_func.forward(
                subject=h,
                predicate=r,
                few_shot_examples=self.entity_relation_to_entities)
            # -100 ranks unmentioned entities below any LM score in [0, 1].
            scores = [-100] * len(self.idx_to_entity)
            for entity, score in predictions:
                try:
                    idx_entity = self.entity_to_idx[entity]
                except KeyError:
                    # The LM proposed an entity outside the KG vocabulary.
                    print(f"Entity:{entity} not found")
                    continue
                scores[idx_entity] = score
            batch_predictions.append(scores)
        return torch.FloatTensor(batch_predictions)

retrieval_augmented_link_predictor.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
import numpy as np
3838
from typing import List, Optional
3939
from dotenv import load_dotenv
40-
from retrieval_aug_predictors import AbstractBaseLinkPredictorClass, RALP, GCL, RCL
40+
from retrieval_aug_predictors import AbstractBaseLinkPredictorClass, RALP, GCL, RCL, Demir
4141

4242
load_dotenv()
4343

@@ -63,6 +63,10 @@ def get_model(args,kg)->AbstractBaseLinkPredictorClass:
6363
model = RCL(knowledge_graph=kg, base_url=args.base_url, api_key=args.api_key,
6464
llm_model=args.llm_model_name, temperature=args.temperature, seed=args.seed,
6565
max_relation_examples=args.max_relation_examples, exclude_source=args.exclude_source)
66+
elif args.model == "Demir":
67+
model = Demir(knowledge_graph=kg, base_url=args.base_url, api_key=args.api_key,
68+
llm_model=args.llm_model_name, temperature=args.temperature, seed=args.seed)
69+
6670
else:
6771
raise KeyError(f"{args.model} is not a valid model")
6872
assert model is not None, f"Couldn't assign a model named: {args.model}"
@@ -85,7 +89,7 @@ def run(args):
8589
if __name__ == "__main__":
8690
parser = argparse.ArgumentParser()
8791
parser.add_argument("--dataset_dir", type=str, default="KGs/Countries-S1", help="Path to dataset.")
88-
parser.add_argument("--model", type=str, default="GCL", help="Model name to use for link prediction.", choices=["RALP", "GCL", "RCL"])
92+
parser.add_argument("--model", type=str, default="Demir", help="Model name to use for link prediction.", choices=["Demir", "GCL", "RCL","RALP"])
8993
parser.add_argument("--base_url", type=str, default="http://harebell.cs.upb.de:8501/v1",
9094
choices=["http://harebell.cs.upb.de:8501/v1", "http://tentris-ml.cs.upb.de:8502/v1"],
9195
help="Base URL for the OpenAI client.")

0 commit comments

Comments
 (0)