|
5 | 5 | # SPDX-FileContributor: Michael Fritzsche |
6 | 6 |
|
7 | 7 |
|
| 8 | +import csv |
8 | 9 | from typing import Any, Callable, Union |
9 | 10 |
|
| 11 | +import requests |
| 12 | + |
10 | 13 | from hermes.commands.base import HermesCommand |
11 | 14 | from hermes.model.merge.action import Concat, MergeAction, MergeSet |
12 | 15 | from hermes.model.types import ld_dict |
13 | 16 | from hermes.model.types.ld_context import iri_map as iri |
14 | 17 | from .base import HermesProcessPlugin |
15 | 18 |
|
16 | 19 |
|
def match_equals(left: Any, right: Any) -> bool:
    """
    Checks two objects for equality using the ``==`` operator.

    Args:
        left (Any): Left-hand operand of the comparison.
        right (Any): Right-hand operand of the comparison.

    Returns:
        bool: Whatever ``left == right`` evaluates to.
    """
    are_equal = left == right
    return are_equal
| 32 | + |
| 33 | + |
17 | 34 | def match_keys(*keys: list[str], fall_back_to_equals: bool = False) -> Callable[[Any, Any], bool]: |
18 | 35 | """ |
19 | 36 | Creates a function taking two parameters that returns true
@@ -140,18 +157,18 @@ def match_func(left: Any, right: Any) -> bool: |
# Built by match_keys with "@id" as the only key and fall_back_to_equals enabled.
# NOTE(review): presumably this compares two values by their "@id" entry and falls back to ``==``
# when that is not possible -- confirm against match_keys.
DEFAULT_MATCH = match_keys("@id", fall_back_to_equals=True)
""" Callable[[Any, Any], bool]: The default match function used for comparison. """

# The key is whatever ``iri["schema:Person"]`` yields, not the literal string "schema:Person",
# so every lookup has to go through ``iri[...]`` as well (ACTIONS does so).
# NOTE(review): presumably ``iri`` maps the compact name to its expanded IRI -- confirm in ld_context.
MATCH_FUNCTION_FOR_TYPE = {iri["schema:Person"]: match_person}
"""
dict[str, Callable[[Any, Any], bool]]: A dict containing for JSON_LD types the match function (not DEFAULT_MATCH).
"""
147 | 164 |
|
148 | 165 | ACTIONS = { |
149 | 166 | "default": MergeSet(DEFAULT_MATCH), |
150 | 167 | "concat": Concat(), |
151 | | - "Person": MergeSet(MATCH_FUNCTION_FOR_TYPE["schema:Person"]), |
| 168 | + "Person": MergeSet(MATCH_FUNCTION_FOR_TYPE[iri["schema:Person"]]), |
152 | 169 | **{ |
153 | 170 | "Or".join(types): MergeSet(match_multiple_types( |
154 | | - *(("schema:" + type, MATCH_FUNCTION_FOR_TYPE.get("schema:" + type, DEFAULT_MATCH)) for type in types) |
| 171 | + *(("schema:" + type, MATCH_FUNCTION_FOR_TYPE.get(iri["schema:" + type], DEFAULT_MATCH)) for type in types) |
155 | 172 | )) |
156 | 173 | for types in [ |
157 | 174 | ("AboutPage", "CreativeWork"), |
@@ -844,7 +861,78 @@ def match_func(left: Any, right: Any) -> bool: |
844 | 861 |
|
class CodemetaProcessPlugin(HermesProcessPlugin):
    """
    Process plugin that supplies the merge strategies (type -> property -> merge action).

    The strategies are derived at call time from the type and property tables published by schema.org.
    If that fails for any reason, the static ``CODEMETA_STRATEGY`` table is used instead.
    """

    # switch to the "schemaorg-current-https-*.csv" files on change of standard context in HERMES
    _SCHEMA_TYPES_URL = "https://schema.org/version/latest/schemaorg-current-http-types.csv"
    _SCHEMA_PROPERTIES_URL = "https://schema.org/version/latest/schemaorg-current-http-properties.csv"

    # Seconds to wait for schema.org before giving up; without a timeout requests may block forever.
    _REQUEST_TIMEOUT = 10

    # Column positions inside the types table.
    _TYPE_ID = 0          # identifier of the type
    _TYPE_SUBTYPES = 7    # ", "-separated direct subtypes of the type

    # Column positions inside the properties table.
    _PROPERTY_ID = 0      # identifier of the property
    _PROPERTY_DOMAIN = 6  # ", "-separated types the property can occur in
    _PROPERTY_RANGE = 7   # ", "-separated types the property can have values of

    def __call__(self, command: HermesCommand) -> dict[Union[str, None], dict[Union[str, None], MergeAction]]:
        """
        Assembles the merge strategies used for processing.

        Args:
            command (HermesCommand): The calling command (not read by this plugin).

        Returns:
            dict[Union[str, None], dict[Union[str, None], MergeAction]]: For every type the merge action
            of each of its properties. Entries of ``PROV_STRATEGY`` are added, but never override an
            entry that already exists for the same type and property.
        """
        try:
            strats = CodemetaProcessPlugin.get_schema_strategies()
            strats.update(CodemetaProcessPlugin.get_codemeta_strategies())
            strats[None] = {None: MergeSet(DEFAULT_MATCH)}
        except Exception:
            # Deliberately broad: generating the strategies is best effort (it needs network access and
            # depends on the layout of external files). Record the reason instead of hiding it, then
            # continue with the static table.
            import logging

            logging.getLogger(__name__).warning(
                "Could not generate merge strategies from schema.org, falling back to the static table.",
                exc_info=True,
            )
            strats = {**CODEMETA_STRATEGY}
        for key, value in PROV_STRATEGY.items():
            # strategies that are already present win over the ones from PROV_STRATEGY
            strats[key] = {**value, **strats.get(key, {})}
        return strats

    @classmethod
    def _fetch_csv_rows(cls, url: str) -> list[list[str]]:
        """
        Downloads a CSV file and returns its data rows.

        Args:
            url (str): The location of the CSV file.

        Returns:
            list[list[str]]: All rows except the header row and blank lines.

        Raises:
            requests.RequestException: If the download fails, times out or yields an HTTP error status.
        """
        response = requests.get(url, timeout=cls._REQUEST_TIMEOUT)
        # never try to parse an error page as CSV
        response.raise_for_status()
        reader = csv.reader(response.content.decode("utf-8").splitlines(), delimiter=",")
        # remove the first line (headers)
        next(reader, None)
        # csv.reader yields [] for blank lines, those carry no information
        return [row for row in reader if row]

    @staticmethod
    def _split_cell(cell: str) -> set[str]:
        """Splits a ", "-separated table cell into a set of its entries (empty cell -> empty set)."""
        return set(cell.split(", ")) if cell != "" else set()

    @staticmethod
    def _with_subtypes(types: set[str], subtypes_for_types: dict[str, set[str]]) -> set[str]:
        """Returns ``types`` together with all recorded subtypes of every type in it."""
        return types.union(*(subtypes_for_types.get(type_id, set()) for type_id in types))

    @classmethod
    def get_schema_strategies(cls) -> dict[str, dict[str, MergeAction]]:
        """
        Generates the merge strategies for schema.org from the tables published on schema.org.

        A property gets a strategy if (a subtype of) one of the types it can have values of has a special
        match function in ``MATCH_FUNCTION_FOR_TYPE``. The strategy is registered for every type the
        property can occur in, including all subtypes of those types.

        Returns:
            dict[str, dict[str, MergeAction]]: For every affected type a dict mapping the property to
            its merge action. Types and properties are identified as they are in the downloaded tables.

        Raises:
            requests.RequestException: If one of the tables can not be downloaded.
            IndexError: If a row of a table has fewer columns than expected.
        """
        # get a set of all types that have to be handled separately
        special_types = set(MATCH_FUNCTION_FOR_TYPE)

        # record the direct subtypes of every type
        subtypes_for_types = {
            type_row[cls._TYPE_ID]: cls._split_cell(type_row[cls._TYPE_SUBTYPES])
            for type_row in cls._fetch_csv_rows(cls._SCHEMA_TYPES_URL)
        }
        # Only immediate subtypes have been recorded now, add sub...subtypes too.
        # The intermediate type has to be the OUTER loop (Warshall's algorithm), only then one pass
        # over all types is enough to get the complete transitive closure.
        for super_type, nested_subtypes in subtypes_for_types.items():
            if not nested_subtypes:
                # nothing to hand up to the super types of this type
                continue
            for subtypes in subtypes_for_types.values():
                if super_type in subtypes:
                    subtypes.update(nested_subtypes)

        strategies = {}
        # add the strategies for all properties to all types they can occur in
        for property_row in cls._fetch_csv_rows(cls._SCHEMA_PROPERTIES_URL):
            # generate a set of all types this property can have values of
            range_types = cls._with_subtypes(
                cls._split_cell(property_row[cls._PROPERTY_RANGE]), subtypes_for_types
            )
            # get all special types this property can have values of
            special_range_types = special_types.intersection(range_types)
            if not special_range_types:
                # no special range type, the default handling is sufficient for this property
                continue
            # construct the merge action, it is shared by all types the property can occur in
            merge_action = MergeSet(match_multiple_types(
                *((range_type, MATCH_FUNCTION_FOR_TYPE[range_type]) for range_type in special_range_types),
                fall_back_function=DEFAULT_MATCH
            ))
            # iterate over a set of all types this property can occur in
            domain_types = cls._with_subtypes(
                cls._split_cell(property_row[cls._PROPERTY_DOMAIN]), subtypes_for_types
            )
            for domain_type in domain_types:
                strategies.setdefault(domain_type, {})[property_row[cls._PROPERTY_ID]] = merge_action
        return strategies

    @classmethod
    def get_codemeta_strategies(cls) -> dict[str, dict[str, MergeAction]]:
        """
        Generates the merge strategies that are specific to CodeMeta.

        Returns:
            dict[str, dict[str, MergeAction]]: Currently always an empty dict.
        """
        # FIXME: implement
        return {}
0 commit comments