SSDpp-numeric/_classes.py at master · HMProenca/SSDpp-numeric · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
# -*- coding: utf-8 -*-

from time import time

from src._mdl_ssd import _fit_list
from src.model_predictions import predict_numeric,estimate_weigthedkullbackleibler_gaussian

class SSDC():
    """Subgroup list discovery for numeric targets with an MDL formulation.
    It resorts to greedy and beam search to find the the subgroup list that
    best fits the data
    Parameters
    ----------
    target_type : string, mandatory
        (possible values: "numeric" or "nominal")
        choose the appropriate target_type (no default value) for the type of
        rule/subgroup search.
    max_depth : int, optional (default=4)
        defines the maximum size that subgroup description can take based
        on the number of variables that the beam search accepts to refine.
        For example, if 'max_depth = 4' the maximum size of a pattern found is
        4.
    beam_width : int, optional (default=100)
        defines the width of the beam in the beam search, i.e., the number of
        patterns that are selected at each iteration to be expanded.
   iterative_beam_width
   n_cutpoints : int, optional (default=5)
       number of cut points used to discretize a numeric attribute/variable.
       Note 1: this algorithm creates for each cutpoint a binary split, and
       the combination of all cutpoints. As an example of the former, if the
       cut point is x_cut = 5, it will create both the condition x<5 and x>5.
       In relation to the latter, if two of the cut points are x_cut1=3, and
       x_cut2=5, it will also create  3<x<5.
   task : string (default = "discovery")
       (possible values: "discovery" or "prediction")
       - "discovery": performs subgroup discovery by assuming the last rule of
       the model as a constant rule and equal to the dataset distribution.
       - "prediction": finds a rule list for prediction by assuming that the
       last rule changes with other rules in the dataset.
   discretization : string (default="static")
       (possible values: "static" or "dynamic")
       - "static" - performs a priori discretization of all numeric variables
       - "dynamic" - at each iteration of the beam search it conditionally
       discretizes all numeric variables based on the given pattern.
   max_rules : int, optional (default=0)
       Maximum number of subgroups/rules to mine. If max_rules=0 is given it
       continues finding subgroups/rules until no more compression is achieved.
   gain : int, optional (default="normalized")
       (possible values: "absolute" or "normalized")
       Type of score used to expand the beam search and to add a rule/subgroup
       at each iteration.
       - "absolute" - adds the rule/subgroup at each iteration that maximizes
       the normalized gain, i.e., that difference between the length of the
       existing model minus the length of that model with the candidate
       subgroup added.
       - "normalized" - adds the rule/subgroup at each iteration that maximizes
       the "absolute" gain normalized by the number of instances covered
       (usage) by that rule/subgroup.
     Attributes
    ----------
    number_rules: int
        Number of rules of the list excluding the default rule.
    antecedent_description: list of strings
        String of each rule antecedent description.
    consequent_description: list of strings
        String of each rule consequents.
    """

    def __init__(self,target_type="numeric",max_depth=4, beam_width = 100,
                 iterative_beam_width=1,n_cutpoints = 5, task = "discovery",
                 discretization = "static",max_rules = 0,
                 gain = "normalized"):
        self.target_type = target_type
        self.gain = gain
        self.max_depth = max_depth
        self.beam_width = beam_width
        self.iterative_beam_width= iterative_beam_width
        self.n_cutpoints = n_cutpoints
        self.discretization = discretization
        self.task = task
        self.number_rules = 0
        self.max_rules = max_rules

    #TODO:  def __repr__
    def __str__(self):
        if self.number_rules == 0:
            text2print = "There are not rules"
        else:
            text2print = ""
            for nr,ant in enumerate(self.antecedent_description):
                if nr == 0:
                    text2print += "IF x in "
                else:
                    text2print += "ELSE IF x in "
                text2print += ant + " THEN " + self.consequent_description[nr] +\
                            " \n"
            text2print += "ELSE " + self.consequent_lastrule_description


        return text2print

    def fit(self,df):
        """Fit the model according to the given training data.
        Parameters
        ----------
        df : pandas dataframe with name variables with last column as target
        variable.
        Returns
        -------
        self : object
        """

        #        if self.min_support_class < self.min_support_global:
        #            warnings.warn("min_support_global > min_support_class => will not"+
        #                          +" be used",UserWarning)
        #
        #        if self.generate_method not in {"FPM","beam","dependence"}:
        #            raise ValueError("Generative method of itemsets shoud be in "\
        #                             "FPM or beam; got (C=%s)"
        #                             % self.generate_method)
        start_time = time()
        rulelist = _fit_list(
                df, self.target_type, self.max_depth,self.beam_width,
                self.iterative_beam_width, self.n_cutpoints,
                self.task,self.discretization,self.max_rules,self.gain)
        self.runtime = time() - start_time
        self.number_rules = rulelist.number_rules
        self.target_type_specific(rulelist)
        self.antecedent_raw = rulelist.antecedent_raw
        self.antecedent_description = rulelist.antecedent_description
        self.consequent_description = rulelist.consequent_description
        self.consequent_lastrule_description = rulelist.consequent_lastrule_description
        self.pattern4prediction = rulelist.pattern4prediction
        self.statistics = rulelist.statistic_rules
        self.default_statistic = rulelist.default_statistic
        self.rule_sets = [[ix for ix,x in enumerate(reversed(bin(bitset)[2:])) if x == '1']
                                                for bitset in rulelist.bitset_rules]
        self.length_model = rulelist.length_model
        self.length_data = rulelist.length_data
        self.length_final = self.length_model + self.length_data
        self.length_original = rulelist.length_original
        self.length_ratio = rulelist.length_ratio
        self.measures = rulelist.measures
        self.measures["runtime"] = self.runtime
        return self


    def predict(self,X):
        """ Predict for new data what is it going to be the performance
        if rule list not fit it does not work
        ----------
        X : a numpy array or pandas dataframe with the variables in the same
            poistion (column number) as given in "fit" function.

        Returns a numpy array y with the predicted values according to the
        fitted rule list (obtained using the "fit" function above). y has the
        same length as X.shape[0] (number of rows).
        -------
        self : object
        """
        y , usageperrule= predict_numeric(self, X)
        values = X.iloc[:,-1]
        return y,usageperrule

    def target_type_specific(self,rulelist):
        if self.target_type == "nominal":
            self.class_codes = rulelist.class_codes
            self.class_counts = rulelist.class_counts
            self.class_orig = rulelist.class_orig
            self.support_uncovered = rulelist.support_uncovered
            self.usage_rules = rulelist.usage_rules
            self.support_rules = rulelist.support_rules
        elif self.target_type == "numeric":
            self.statistic_rules = rulelist.statistic_rules
            self.default_statistic = rulelist.default_statistic
            self.support_covered = rulelist.support_uncovered
        return self

    def rulelist_description(self):
        text2add = ""
        for nr,ant in enumerate(self.antecedent_description):
            if nr == 0:
                text2add += "IF "
            else:
                text2add += "ELSE IF "
            text2add += ant + " THEN " + self.consequent_description[nr] +\
            " \n"