
Backend documentation

Preprocessing module

This module is in charge of:
  • supporting event log imports from XES/CSV files.
  • formatting the event log so that it can later be used by the nn_manager module. In particular, the timestamps are encoded as integers, the case IDs and activity names are label-encoded, and the rows are sorted by case ID and timestamp. Splitting the event log into training and testing sublogs is also supported.
  • computing values such as the number of activities and the absolute frequency distribution of the activities, which are also required for training the neural network.
  • formatting the log automatically after importing; this can be disabled by setting the corresponding parameter.
  • other preprocessing operations, such as replacing NaN values, adding a unique start/end activity to the log, and removing duplicate rows.

Note that this module does not bring the event log into the input format expected by the RNN; this is done by the module util.py in the RMTPP_torch subpackage.
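As a quick orientation, here is a minimal usage sketch (assuming the module is importable as preprocessing; the file path and column names are placeholders):

from preprocessing import Preprocessing

preprocessor = Preprocessing()
preprocessor.handle_import(
    is_xes=False,                      # CSV import
    path="data/example_log.csv",       # placeholder path
    case_id="case:concept:name",
    timestamp="time:timestamp",
    activity="concept:name",
    sep=",",
    formatting=True,                   # encode columns and sort rows right after importing
)

# values computed during formatting, later needed by nn_manager
print(preprocessor.number_classes)
print(preprocessor.absolute_frequency_distribution)

# case-level split into training and testing sublogs
train_df, test_df = preprocessor.split_train_test(0.8)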

Preprocessing

This is the preprocessing unit for our server, which implements all the above mentioned functionalities.

Source code in server/preprocessing.py
class Preprocessing: 
    """
    This is the preprocessing unit for our server, which implements all the above mentioned functionalities.
    """
    def __init__(self):
        self.time_precision = None

        #: contains the event log path
        self.event_log_path = None
        self.event_log= None
        self.case_id_key = None 
        self.case_activity_key = None 
        self.case_timestamp_key = None 

        self.event_df = None #: dataframe containing the event log. corresponds to the imported log and eventually also the formatted one
        self.number_classes=0
        self.absolute_frequency_distribution = None
        self.case_id_le = None
        self.activity_le = None
        self.exponent = None
        self.unencoded_df = None



    def xes_helper(self, path): 
        """just a testing function"""
        log =pm4py.read.read_xes(path)
        dataframe = pm4py.convert_to_dataframe(log)
        print("done loading")
        print(dataframe.columns)


    def handle_import(self,is_xes, path, case_id, timestamp, activity,time_precision = time_precision.TimePrecision.NS,  sep = ",", formatting = True):
        """
        handles the import of the event log. 

        Args:
            is_xes (bool): If True, the event log is in XES format. If False, it is in CSV format.
            path (str): Path to the event log.
            case_id (str): Case id column name.
            timestamp (str): Timestamp column name.
            activity (str): Activity column name.
            time_precision (TimePrecision, optional): Time precision. Defaults to TimePrecision.NS. Note that this functionality is incomplete.
            sep (str, optional): Separator. Defaults to ",".
            formatting (bool, optional): If True, the event log is formatted so that it can be used by the RNN. Defaults to True. 
        """
        self.time_precision = time_precision
        self.case_id_key =  case_id
        self.case_activity_key =activity 
        self.case_timestamp_key =timestamp 
        if is_xes: 
            self.import_event_log_xes(path, formatting)
        else: 
            self.import_event_log_csv(path, sep, formatting)


    def import_event_log_xes(self, path, formatting=True):
        """
        Imports an event log in XES format.

        Args:
        path (str): Path to the XES file.
        formatting (bool, optional): If True, the event log is formatted so that it can be used by the RNN. Defaults to True.

        Effects:
        - event_df dataframe is generated.
        - The generated dataframe has 3 columns: case id (string), label (string), and timestamp (datetime64).
        - event log object: its correctness is assumed from the pm4py library and is therefore not tested.
        """
        self.event_df = pm4py.read.read_xes(path)
        self.event_df = pm4py.convert_to_dataframe(self.event_df)
        self.import_event_log(formatting)


    def import_event_log_csv(self, path, sep, formatting = True): 
        """
        This is an adapter for format_dataframe such that the event data can be properly used by the RNN.

        Args:
            path (str): Path to the event log.
            sep (str): Separator.
            formatting (bool, optional): If True, the event log is formatted so that it can be used by the RNN. Defaults to True.
        """
        self.event_df= pd.read_csv(path, sep=sep)
        self.import_event_log(formatting)


    def import_event_log_dataframe(self,df, case_id, activity_key, timestamp_key, formatting = True):
        """
        This is an adapter for format_dataframe such that the event data can be properly used by the RNN model.

        Args:
            df (DataFrame): The event log as a dataframe.
            case_id (str): Case id column name.
            activity_key (str): Activity column name.
            timestamp_key (str): Timestamp column name.
            formatting (bool, optional): If True, the event log is formatted so that it can be used by the RNN. Defaults to True.
        """
        self.event_df = df
        self.case_id_key =  case_id
        self.case_activity_key =activity_key
        self.case_timestamp_key =timestamp_key
        self.import_event_log(formatting)


    def import_event_log(self, formatting):
        """
        helper function for import_event_log_csv and import_event_log_xes. 
        - generates an EventLog object so that other pm4py functions can use it
        - remove all columns other than the three main ones
        - remove all NaN entries
        - format a dataframe using pm4py 
        Effects: 
        - rows sorted by case id and timestamp

        Args:
            formatting (bool): If True, the event log is formatted so that it can be used by the RNN.
        """
        #: returns a formatted dataframe that can work with other pm4py functions
        self.event_df = pm4py.format_dataframe(self.event_df, 
                                           case_id=self.case_id_key,
                                             activity_key=self.case_activity_key,
                                             timestamp_key=self.case_timestamp_key) #returns formatted df.

        #: convert_to_event_log requires string format for case_id and marker
        self.event_df[self.case_id_key] = self.event_df[self.case_id_key].astype("string")
        self.event_df[self.case_activity_key] = self.event_df[self.case_activity_key].astype("string")



        self.event_df[self.case_timestamp_key] = self.event_df[self.case_timestamp_key].astype("datetime64[ns, UTC]")




        self.event_log = pm4py.convert_to_event_log(self.event_df, self.case_id_key) #this returns an event log



        #: filter out all the other generated columns
        self.event_df= self.event_df[[self.case_id_key, self.case_activity_key, self.case_timestamp_key]]


        #: the rest should only be executed when training
        if not formatting:
            return


        self.event_df= self.event_df.dropna()
        #: used for conformance checking, save everything except the 
        # extra columns 
        self.unencoded_df = self.event_df.copy(deep = True)


        #: sort the rows by case id and timestamp key
        self.event_df =  self.event_df.sort_values(by=[self.case_id_key, self.case_timestamp_key])




        self.encode_df_columns()



    def string_to_index(self , df, column):
        """
        translate each marker into a specific integer index.  
        """
        col = df[column].tolist()
        uniques = set(col)
        enume = [(label, index) for index, label in enumerate(uniques)]
        return dict(enume)


    def encode_df_columns(self):
        """
        - encode the markers and case id's with integers (label encoding)
        - encode the timestamps
        - returns nothing, but modifies self.event_df

        The following holds for `self.event_df` after this function is called:
            - all columns are sorted by case id and timestamp
            - the case id and markers are encoded with integers
            - the timestamps are encoded as floats. timezone information is removed. 
        """
        #: we encode the markers with integers (label encoding) to be consistent with the authors implementation

        #: only initialize the encoders if they don't exist yet



        if self.activity_le == None: 
            self.activity_le = LabelEncoder()
            self.event_df[self.case_activity_key] = self.activity_le.fit_transform(self.event_df[self.case_activity_key])
        else: 
            #: this will not overwrite the existing le
            self.event_df[self.case_activity_key] = self.activity_le.transform(self.event_df[self.case_activity_key])

        if self.case_id_le == None:
            self.case_id_le = LabelEncoder()
            self.event_df[self.case_id_key] = self.case_id_le.fit_transform(self.event_df[self.case_id_key])
            #: the other case is not necessary. the label encoders are only passed
            # to the preprocessor when doing predictions (not when creating a predictive log).
            # the case id is not used for making predictions; and also does not make sense for
            # ongoing cases (in terms of using past behaviour to predict future behaviour)


        #: get the number of classes
        self.number_classes = len(self.event_df[self.case_activity_key].unique()) 
        #: transform back into strings, it's necessary for pm4py
        self.event_df[self.case_activity_key] =self.event_df[self.case_activity_key].astype("str")
        self.event_df[self.case_id_key] =self.event_df[self.case_id_key].astype("str")

        #: compute abs. freq. distribution for the activities. it's necessary for CrossEntropyLoss
        self.absolute_frequency_distribution= Counter(self.event_df[self.case_activity_key].to_list())

        # remove timezone information
        self.event_df[self.case_timestamp_key] = self.event_df[self.case_timestamp_key].dt.tz_localize(None)

        #: here we convert the datetime64 (ISO standard) into an integer in POSIX standard. the authors
        # use an Excel format, but we decide to use integers for simplicity.
        self.event_df[self.case_timestamp_key] = self.event_df[self.case_timestamp_key].astype(int)


        if self.time_precision == time_precision.TimePrecision.NS: 

            #: nanoseconds can cause numerical instability. therefore we make the numbers smaller by shifting the decimal point by `exponent` places
            self.exponent = self.event_df[self.case_timestamp_key].astype(str).apply(lambda x: len(x)).mean()
            self.event_df[self.case_timestamp_key] = self.event_df[self.case_timestamp_key] / (10 ** self.exponent)
            #: note that other time precisions are not supported yet. TODO

        #: transform the case id and markers back into float
        self.event_df[self.case_activity_key] = self.event_df[self.case_activity_key].astype("float64")
        self.event_df[self.case_id_key] = self.event_df[self.case_id_key].astype("float64")

    def split_train_test(self, train_percentage):
        """
        This is a helper function for splitting the event log into training and testing data.

        Args:
            train_percentage (float): Fraction of cases (between 0 and 1) to be used for training.

        Returns:
            tuple: A tuple containing the training and testing event logs (dataframes).
        """
        if train_percentage>=1 or train_percentage<=0: 
            raise exceptions.TrainPercentageTooHigh()

        cases = self.event_df[self.case_id_key].unique().tolist()
        train_cases = set()
        test_cases = set()
        for c in cases:
            r = random.random()
            if r <= train_percentage:
                train_cases.add(c)
            else:
                test_cases.add(c)
        train = self.event_df[self.event_df[self.case_id_key].isin(train_cases)]
        test = self.event_df[self.event_df[self.case_id_key].isin(test_cases)]

        if test.shape[0] == 0: 
            raise exceptions.TrainPercentageTooHigh()


        return train, test



    def find_start_activities(self):
        """
        find the start activities of all cases for an existing log and return a dict with the start activities as keys and their counts as values
        """
        start_activities = pm4py.stats.get_start_activities(self.event_log, activity_key=self.case_activity_key, case_id_key=self.case_id_key, timestamp_key=self.case_timestamp_key)
        return start_activities

    def find_end_activities(self):
        """
        find the end activities of all cases for an existing log and return a dict mapping each activity to a boolean that indicates whether it occurs as an end activity
        """
        end_activities = pm4py.get_end_activities(self.event_log, activity_key=self.case_activity_key, case_id_key=self.case_id_key, timestamp_key=self.case_timestamp_key)
        activities = self.event_df[self.case_activity_key].unique()
        end_activity_lookup = {activity:False  for activity in activities}
        for activity in end_activities:
            end_activity_lookup[activity]= True
        return end_activity_lookup


    def get_sample_case(self):
        """
        returns a sample of a case
        """
        sampled_dataframe = pm4py.sample_cases(self.event_log, 1, case_id_key=self.case_id_key)
        return sampled_dataframe



    def replace_activity_nan_with_mode(self): 
        """
        replaces NaN values in the activity column with the mode
        """

        mode = self.event_df[self.case_activity_key].mode()
        self.event_df[self.case_activity_key].fillna(mode.iloc[0], inplace=True)

        return True


    def remove_duplicate_rows(self): 
        #: removes the duplicates ie the rows where the same activity happened at the same time in the same case id.
        # since we are dropping all the other columns, these duplicates make no sense.
        self.event_df = self.event_df.drop_duplicates(subset=[self.case_id_key, self.case_activity_key, self.case_timestamp_key])
        return True


    def add_unique_start_end_activity(self):
        """
        if there is no unique start/ end activity, add an artificial start and end activity
        """
        if (len(self.find_start_activities()) != 1) or (len(self.find_end_activities()) != 1):
            processed_log= pm4py.insert_artificial_start_end(
                self.event_log, 
                activity_key=self.case_activity_key, 
                case_id_key=self.case_id_key, 
                timestamp_key=self.case_timestamp_key
            )
            self.event_df =pm4py.convert_to_dataframe(processed_log) 
            return True
        return False

add_unique_start_end_activity()

if there is no unique start/ end activity, add an artificial start and end activity

Source code in server/preprocessing.py
def add_unique_start_end_activity(self):
    """
    if there is no unique start/ end activity, add an artificial start and end activity
    """
    if (len(self.find_start_activities()) != 1) or (len(self.find_end_activities()) != 1):
        processed_log= pm4py.insert_artificial_start_end(
            self.event_log, 
            activity_key=self.case_activity_key, 
            case_id_key=self.case_id_key, 
            timestamp_key=self.case_timestamp_key
        )
        self.event_df =pm4py.convert_to_dataframe(processed_log) 
        return True
    return False

encode_df_columns()

  • encode the markers and case id's with integers (label encoding)
  • encode the timestamps
  • returns nothing, but modifies self.event_df

The following holds for self.event_df after this function is called:
  • all columns are sorted by case id and timestamp
  • the case id and markers are encoded with integers
  • the timestamps are encoded as floats; timezone information is removed.

Source code in server/preprocessing.py
def encode_df_columns(self):
    """
    - encode the markers and case id's with integers (label encoding)
    - encode the timestamps
    - returns nothing, but modifies self.event_df

    The following holds for `self.event_df` after this function is called:
        - all columns are sorted by case id and timestamp
        - the case id and markers are encoded with integers
        - the timestamps are encoded as floats. timezone information is removed. 
    """
    #: we encode the markers with integers (label encoding) to be consistent with the authors implementation

    #: only initialize the encoders if they don't exist yet



    if self.activity_le == None: 
        self.activity_le = LabelEncoder()
        self.event_df[self.case_activity_key] = self.activity_le.fit_transform(self.event_df[self.case_activity_key])
    else: 
        #: this will not overwrite the existing le
        self.event_df[self.case_activity_key] = self.activity_le.transform(self.event_df[self.case_activity_key])

    if self.case_id_le == None:
        self.case_id_le = LabelEncoder()
        self.event_df[self.case_id_key] = self.case_id_le.fit_transform(self.event_df[self.case_id_key])
        #: the other case is not necessary. the label encoders are only passed
        # to the preprocessor when doing predictions (not when creating a predictive log).
        # the case id is not used for making predictions; and also does not make sense for
        # ongoing cases (in terms of using past behaviour to predict future behaviour)


    #: get the number of classes
    self.number_classes = len(self.event_df[self.case_activity_key].unique()) 
    #: transform back into strings, it's necessary for pm4py
    self.event_df[self.case_activity_key] =self.event_df[self.case_activity_key].astype("str")
    self.event_df[self.case_id_key] =self.event_df[self.case_id_key].astype("str")

    #: compute abs. freq. distribution for the activities. it's necessary for CrossEntropyLoss
    self.absolute_frequency_distribution= Counter(self.event_df[self.case_activity_key].to_list())

    # remove timezone information
    self.event_df[self.case_timestamp_key] = self.event_df[self.case_timestamp_key].dt.tz_localize(None)

    #: here we convert the datetime64 (ISO standard) into an integer in POSIX standard. the authors
    # use an Excel format, but we decide to use integers for simplicity.
    self.event_df[self.case_timestamp_key] = self.event_df[self.case_timestamp_key].astype(int)


    if self.time_precision == time_precision.TimePrecision.NS: 

        #: nanoseconds can cause numerical instability. therefore we make the numbers smaller by shifting the decimal point by `exponent` places
        self.exponent = self.event_df[self.case_timestamp_key].astype(str).apply(lambda x: len(x)).mean()
        self.event_df[self.case_timestamp_key] = self.event_df[self.case_timestamp_key] / (10 ** self.exponent)
        #: note that other time precisions are not supported yet. TODO

    #: transform the case id and markers back into float
    self.event_df[self.case_activity_key] = self.event_df[self.case_activity_key].astype("float64")
    self.event_df[self.case_id_key] = self.event_df[self.case_id_key].astype("float64")
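For illustration, here is a standalone sketch (pandas only, not part of the class) of how the exponent and the scaled timestamps are computed for nanosecond precision:

import pandas as pd

ts = pd.Series(pd.to_datetime([
    "2024-01-01 08:00:00", "2024-01-01 08:05:00", "2024-01-01 08:30:00"
]))

as_int = ts.astype("int64")                       # POSIX time in nanoseconds
exponent = as_int.astype(str).apply(len).mean()   # mean number of digits, here 19.0
scaled = as_int / (10 ** exponent)                # shifted into a small numeric range

print(exponent, scaled.tolist())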

find_end_activities()

" find the end activities of all cases for an existing log and return a dict with end activities as keys and value is the count of this activity

Source code in server/preprocessing.py
def find_end_activities(self):
    """
    find the end activities of all cases for an existing log and return a dict mapping each activity to a boolean that indicates whether it occurs as an end activity
    """
    end_activities = pm4py.get_end_activities(self.event_log, activity_key=self.case_activity_key, case_id_key=self.case_id_key, timestamp_key=self.case_timestamp_key)
    activities = self.event_df[self.case_activity_key].unique()
    end_activity_lookup = {activity:False  for activity in activities}
    for activity in end_activities:
        end_activity_lookup[activity]= True
    return end_activity_lookup

find_start_activities()

Find the start activities of all cases for an existing log and return a dict with the start activities as keys and their counts as values.

Source code in server/preprocessing.py
def find_start_activities(self):
    """
    find the start activities of all cases for an existing log and return a dict with the start activities as keys and their counts as values
    """
    start_activities = pm4py.stats.get_start_activities(self.event_log, activity_key=self.case_activity_key, case_id_key=self.case_id_key, timestamp_key=self.case_timestamp_key)
    return start_activities

get_sample_case()

returns a sample of a case

Source code in server/preprocessing.py
def get_sample_case(self):
    """
    returns a sample of a case
    """
    sampled_dataframe = pm4py.sample_cases(self.event_log, 1, case_id_key=self.case_id_key)
    return sampled_dataframe

handle_import(is_xes, path, case_id, timestamp, activity, time_precision=time_precision.TimePrecision.NS, sep=',', formatting=True)

handles the import of the event log.

Parameters:

  • is_xes (bool): If True, the event log is in XES format. If False, it is in CSV format. Required.
  • path (str): Path to the event log. Required.
  • case_id (str): Case id column name. Required.
  • timestamp (str): Timestamp column name. Required.
  • activity (str): Activity column name. Required.
  • time_precision (TimePrecision): Time precision. Defaults to TimePrecision.NS. Note that this functionality is incomplete.
  • sep (str): Separator. Defaults to ",".
  • formatting (bool): If True, the event log is formatted so that it can be used by the RNN. Defaults to True.
Source code in server/preprocessing.py
def handle_import(self,is_xes, path, case_id, timestamp, activity,time_precision = time_precision.TimePrecision.NS,  sep = ",", formatting = True):
    """
    handles the import of the event log. 

    Args:
        is_xes (bool): If True, the event log is in XES format. If False, it is in CSV format.
        path (str): Path to the event log.
        case_id (str): Case id column name.
        timestamp (str): Timestamp column name.
        activity (str): Activity column name.
        time_precision (TimePrecision, optional): Time precision. Defaults to TimePrecision.NS. Note that this functionality is incomplete.
        sep (str, optional): Separator. Defaults to ",".
        formatting (bool, optional): If True, the event log is formatted so that it can be used by the RNN. Defaults to True. 
    """
    self.time_precision = time_precision
    self.case_id_key =  case_id
    self.case_activity_key =activity 
    self.case_timestamp_key =timestamp 
    if is_xes: 
        self.import_event_log_xes(path, formatting)
    else: 
        self.import_event_log_csv(path, sep, formatting)

import_event_log(formatting)

Helper function for import_event_log_csv and import_event_log_xes.
  • generates an EventLog object so that other pm4py functions can use it
  • removes all columns other than the three main ones
  • removes all NaN entries
  • formats the dataframe using pm4py

Effects:
  • rows sorted by case id and timestamp

Parameters:

  • formatting (bool): If True, the event log is formatted so that it can be used by the RNN. Required.
Source code in server/preprocessing.py
def import_event_log(self, formatting):
    """
    helper function for import_event_log_csv and import_event_log_xes. 
    - generates an EventLog object so that other pm4py functions can use it
    - remove all columns other than the three main ones
    - remove all NaN entries
    - format a dataframe using pm4py 
    Effects: 
    - rows sorted by case id and timestamp

    Args:
        formatting (bool): If True, the event log is formatted so that it can be used by the RNN.
    """
    #: returns a formatted dataframe that can work with other pm4py functions
    self.event_df = pm4py.format_dataframe(self.event_df, 
                                       case_id=self.case_id_key,
                                         activity_key=self.case_activity_key,
                                         timestamp_key=self.case_timestamp_key) #returns formatted df.

    #: convert_to_event_log requires string format for case_id and marker
    self.event_df[self.case_id_key] = self.event_df[self.case_id_key].astype("string")
    self.event_df[self.case_activity_key] = self.event_df[self.case_activity_key].astype("string")



    self.event_df[self.case_timestamp_key] = self.event_df[self.case_timestamp_key].astype("datetime64[ns, UTC]")




    self.event_log = pm4py.convert_to_event_log(self.event_df, self.case_id_key) #this returns an event log



    #: filter out all the other generated columns
    self.event_df= self.event_df[[self.case_id_key, self.case_activity_key, self.case_timestamp_key]]


    #: the rest should only be executed when training
    if not formatting:
        return


    self.event_df= self.event_df.dropna()
    #: used for conformance checking, save everything except the 
    # extra columns 
    self.unencoded_df = self.event_df.copy(deep = True)


    #: sort the rows by case id and timestamp key
    self.event_df =  self.event_df.sort_values(by=[self.case_id_key, self.case_timestamp_key])




    self.encode_df_columns()

import_event_log_csv(path, sep, formatting=True)

This is an adapter for format_dataframe such that the event data can be properly used by the RNN.

Parameters:

  • path (str): Path to the event log. Required.
  • sep (str): Separator. Required.
  • formatting (bool): If True, the event log is formatted so that it can be used by the RNN. Defaults to True.
Source code in server/preprocessing.py
def import_event_log_csv(self, path, sep, formatting = True): 
    """
    This is an adapter for format_dataframe such that the event data can be properly used by the RNN.

    Args:
        path (str): Path to the event log.
        sep (str): Separator.
        formatting (bool, optional): If True, the event log is formatted so that it can be used by the RNN. Defaults to True.
    """
    self.event_df= pd.read_csv(path, sep=sep)
    self.import_event_log(formatting)

import_event_log_dataframe(df, case_id, activity_key, timestamp_key, formatting=True)

This is an adapter for format_dataframe such that the event data can be properly used by the RNN model.

Parameters:

  • df (DataFrame): The event log as a dataframe. Required.
  • case_id (str): Case id column name. Required.
  • activity_key (str): Activity column name. Required.
  • timestamp_key (str): Timestamp column name. Required.
  • formatting (bool): If True, the event log is formatted so that it can be used by the RNN. Defaults to True.
Source code in server/preprocessing.py
def import_event_log_dataframe(self,df, case_id, activity_key, timestamp_key, formatting = True):
    """
    This is an adapter for format_dataframe such that the event data can be properly used by the RNN model.

    Args:
        df (DataFrame): The event log as a dataframe.
        case_id (str): Case id column name.
        activity_key (str): Activity column name.
        timestamp_key (str): Timestamp column name.
        formatting (bool, optional): If True, the event log is formatted so that it can be used by the RNN. Defaults to True.
    """
    self.event_df = df
    self.case_id_key =  case_id
    self.case_activity_key =activity_key
    self.case_timestamp_key =timestamp_key
    self.import_event_log(formatting)
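A small sketch of importing an in-memory dataframe (toy data; the column names are placeholders and the module is assumed to be importable as preprocessing):

import pandas as pd
from preprocessing import Preprocessing

df = pd.DataFrame({
    "case_id":   ["1", "1", "2", "2"],
    "activity":  ["register", "pay", "register", "ship"],
    "timestamp": pd.to_datetime([
        "2024-01-01 08:00", "2024-01-01 09:00",
        "2024-01-02 10:00", "2024-01-02 11:30",
    ], utc=True),
})

preprocessor = Preprocessing()
preprocessor.import_event_log_dataframe(df, "case_id", "activity", "timestamp", formatting=True)
print(preprocessor.event_df.head())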

import_event_log_xes(path, formatting=True)

Imports an event log in XES format.

Args:
  • path (str): Path to the XES file.
  • formatting (bool, optional): If True, the event log is formatted so that it can be used by the RNN. Defaults to True.

Effects:
  • event_df dataframe is generated.
  • The generated dataframe has 3 columns: case id (string), label (string), and timestamp (datetime64).
  • event log object: its correctness is assumed from the pm4py library and is therefore not tested.

Source code in server/preprocessing.py
def import_event_log_xes(self, path, formatting=True):
    """
    Imports an event log in XES format.

    Args:
    path (str): Path to the XES file.
    formatting (bool, optional): If True, the event log is formatted so that it can be used by the RNN. Defaults to True.

    Effects:
    - event_df dataframe is generated.
    - The generated dataframe has 3 columns: case id (string), label (string), and timestamp (datetime64).
    - event log object: its correctness is assumed from the pm4py library and is therefore not tested.
    """
    self.event_df = pm4py.read.read_xes(path)
    self.event_df = pm4py.convert_to_dataframe(self.event_df)
    self.import_event_log(formatting)

replace_activity_nan_with_mode()

Replaces NaN values in the activity column with the mode.

Source code in server/preprocessing.py
def replace_activity_nan_with_mode(self): 
    """
    replaces NaN values in the activity column with the mode
    """

    mode = self.event_df[self.case_activity_key].mode()
    self.event_df[self.case_activity_key].fillna(mode.iloc[0], inplace=True)

    return True

split_train_test(train_percentage)

This is a helper function for splitting the event log into training and testing data.

Parameters:

  • train_percentage (float): Fraction of cases (between 0 and 1) to be used for training. Required.

Returns:

  • tuple: A tuple containing the training and testing event logs (dataframes).

Source code in server/preprocessing.py
def split_train_test(self, train_percentage):
    """
    This is a helper function for splitting the event log into training and testing data.

    Args:
        train_percentage (float): Fraction of cases (between 0 and 1) to be used for training.

    Returns:
        tuple: A tuple containing the training and testing event logs (dataframes).
    """
    if train_percentage>=1 or train_percentage<=0: 
        raise exceptions.TrainPercentageTooHigh()

    cases = self.event_df[self.case_id_key].unique().tolist()
    train_cases = set()
    test_cases = set()
    for c in cases:
        r = random.random()
        if r <= train_percentage:
            train_cases.add(c)
        else:
            test_cases.add(c)
    train = self.event_df[self.event_df[self.case_id_key].isin(train_cases)]
    test = self.event_df[self.event_df[self.case_id_key].isin(test_cases)]

    if test.shape[0] == 0: 
        raise exceptions.TrainPercentageTooHigh()


    return train, test
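The split is done per case, so all events of a case end up in the same sublog. A short usage sketch (assuming preprocessor already holds a formatted log):

train_df, test_df = preprocessor.split_train_test(0.7)

# no case id should appear in both sublogs
train_cases = set(train_df[preprocessor.case_id_key])
test_cases = set(test_df[preprocessor.case_id_key])
assert train_cases.isdisjoint(test_cases)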

string_to_index(df, column)

translate each marker into a specific integer index.

Source code in server/preprocessing.py
def string_to_index(self , df, column):
    """
    translate each marker into a specific integer index.  
    """
    col = df[column].tolist()
    uniques = set(col)
    enume = [(label, index) for index, label in enumerate(uniques)]
    return dict(enume)

xes_helper(path)

just a testing function

Source code in server/preprocessing.py
def xes_helper(self, path): 
    """just a testing function"""
    log =pm4py.read.read_xes(path)
    dataframe = pm4py.convert_to_dataframe(log)
    print("done loading")
    print(dataframe.columns)

NN management module

This module is in charge of training the NN Model and also testing it.

The generated model may be exported and also imported later on by this module.

It supports manual training, random search, and grid search.

Config

class containing the configuration for the model.

Attributes:

  • seq_len (int): The sequence length used for the sliding window.
  • emb_dim (int): The embedding dimension.
  • hid_dim (int): The hidden dimension.
  • mlp_dim (int): The MLP dimension used for the LSTM.
  • batch_size (int): The batch size.
  • alpha (float): The alpha value.
  • dropout (float): The dropout value.
  • time_precision (TimePrecision): The time precision. Only NS is supported.
  • lr (float): The learning rate.
  • epochs (int): The number of epochs.
  • importance_weight (str): The importance weight (set to a default value as in the RMTPP implementation).
  • verbose_step (int): The verbose step, just for logging purposes.
  • cuda (bool): Whether to use the GPU.
  • absolute_frequency_distribution (Counter): The absolute frequency distribution of the classes.
  • case_id_le (LabelEncoder): The case ID label encoder.
  • activity_le (LabelEncoder): The activity label encoder.
  • exponent (int): The exponent used for the time conversion (see the preprocessing module).
  • number_classes (int): The number of possible activities in the data.
  • case_activity_key (str): The case activity key.
  • case_timestamp_key (str): The case timestamp key.
  • case_id_key (str): The case ID key.

Source code in server/nn_manager.py
class Config: 
    """
    class containing the configuration for the model. 

    Attributes:
        seq_len (int): The sequence length used for the sliding window. 
        emb_dim (int): The embedding dimension.
        hid_dim (int): The hidden dimension.
        mlp_dim (int): The MLP dimension used for the LSTM.
        batch_size (int): The batch size.
        alpha (float): The alpha value.
        dropout (float): The dropout value.
        time_precision (TimePrecision): The time precision. Only NS is supported.
        lr (float): The learning rate.
        epochs (int): The number of epochs.
        importance_weight (str): The importance weight. (set to a default value as in the RMTPP implementation)
        verbose_step (int): The verbose step, just for logging purposes.
        cuda (bool): Whether to use the GPU.
        absolute_frequency_distribution (Counter): The absolute frequency distribution of the classes.
        case_id_le (LabelEncoder): The case ID label encoder.
        activity_le (LabelEncoder): The activity label encoder.
        exponent (int): The exponent used for the time conversion (see the preprocessing module).
        number_classes (int): The number of possible activities in the data.
        case_activity_key (str): The case activity key.
        case_timestamp_key (str): The case timestamp key.
        case_id_key (str): The case ID key.
    """
    def __init__(self):
        self.seq_len: int= 10
        self.emb_dim: int= 1500
        self.hid_dim:int=1500
        self.mlp_dim:int= 1500
        self.batch_size:int = 1024
        self.alpha: float= 0.05
        self.dropout:float= 0.1
        self.time_precision:time_precision.TimePrecision = time_precision.TimePrecision.NS
        self.lr:float = 1e-3
        self.epochs: int = 3 
        self.importance_weight: str = "store_true"
        self.verbose_step: int = 350
        self.cuda: bool = False 
        self.absolute_frequency_distribution:Counter = Counter()
        self.case_id_le:LabelEncoder = None
        self.activity_le:LabelEncoder = None
        self.exponent:int = None
        self.number_classes:int = 0
        self.case_activity_key: str=""
        self.case_timestamp_key:str =""
        self.case_id_key:str = ""
    def asdict(self):
        """
        used for exporting as a dictionary 
        """
        return {
            "time_precision": self.time_precision.name,
            "seq_len": self.seq_len,
            "emb_dim": self.emb_dim,
            "hid_dim": self.hid_dim,
            "mlp_dim": self.mlp_dim,
            "alpha": self.alpha,
            "dropout": self.dropout,
            "batch_size": self.batch_size,
            "lr": self.lr,
            "epochs": self.epochs,
            "importance_weight": self.importance_weight,
            "verbose_step": self.verbose_step,
            "cuda": self.cuda,
            "absolute_frequency_distribution": dict(self.absolute_frequency_distribution),
            "case_id_le": self.encoder_to_dict(self.case_id_le),
            "activity_le": self.encoder_to_dict(self.activity_le),
            "exponent": self.exponent,
            "number_classes": self.number_classes,
            "case_id_key": self.case_id_key,
            "case_timestamp_key":self.case_timestamp_key, 
            "case_activity_key": self.case_activity_key
        }
    def load_config(self, dic):
        """
        used for importing
        """
        self.time_precision= time_precision.TimePrecision[dic["time_precision"]] 
        self.seq_len=int(dic["seq_len"])
        self.emb_dim=int(dic["emb_dim"])
        self.hid_dim=int(dic["hid_dim"])
        self.mlp_dim=int(dic["mlp_dim"])
        self.alpha=float(dic["alpha"])
        self.dropout=float(dic["dropout"])
        self.batch_size=int(dic["batch_size"])
        self.lr=float(dic["lr"])
        self.epochs=int(dic["epochs"])
        self.importance_weight =dic["importance_weight"] #string
        self.verbose_step = int(dic["verbose_step"])
        self.cuda = True if dic["cuda"]=="True" else False  
        self.absolute_frequency_distribution = Counter(dic["absolute_frequency_distribution"])
        self.case_id_le = self.dict_to_encoder(dic["case_id_le"])
        self.activity_le = self.dict_to_encoder(dic["activity_le"])
        self.exponent =int(dic["exponent"])
        self.number_classes =int(dic["number_classes"])
        self.case_activity_key = dic["case_activity_key"]
        self.case_id_key = dic["case_id_key"]
        self.case_timestamp_key = dic["case_timestamp_key"]

    def encoder_to_dict(self, encoder:LabelEncoder)->dict:
        """
        cast the encoder to a dictionary 
        """
        return {label:index for index, label in enumerate(encoder.classes_)} 

    def dict_to_encoder(self, dic:dict)->LabelEncoder:
        """
        cast the dictionary to an encoder
        """

        encoder = LabelEncoder()
        encoder.classes_ = np.array(list(dic.keys()))
        return encoder
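A short round-trip sketch of exporting and re-importing a configuration (the encoders, exponent, and counts below are placeholders; in practice they are produced by the preprocessing module):

from collections import Counter
from sklearn.preprocessing import LabelEncoder
from nn_manager import Config  # assuming the module is importable like this

config = Config()
config.exponent = 19
config.number_classes = 3
config.case_id_le = LabelEncoder().fit(["1", "2"])
config.activity_le = LabelEncoder().fit(["register", "pay", "ship"])
config.absolute_frequency_distribution = Counter({"0": 2, "1": 1, "2": 1})

exported = config.asdict()       # plain dictionary, e.g. for storing as JSON

restored = Config()
restored.load_config(exported)   # rebuilds the encoders from the dictionary
print(restored.activity_le.classes_)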

asdict()

used for exporting as a dictionary

Source code in server/nn_manager.py
def asdict(self):
    """
    used for exporting as a dictionary 
    """
    return {
        "time_precision": self.time_precision.name,
        "seq_len": self.seq_len,
        "emb_dim": self.emb_dim,
        "hid_dim": self.hid_dim,
        "mlp_dim": self.mlp_dim,
        "alpha": self.alpha,
        "dropout": self.dropout,
        "batch_size": self.batch_size,
        "lr": self.lr,
        "epochs": self.epochs,
        "importance_weight": self.importance_weight,
        "verbose_step": self.verbose_step,
        "cuda": self.cuda,
        "absolute_frequency_distribution": dict(self.absolute_frequency_distribution),
        "case_id_le": self.encoder_to_dict(self.case_id_le),
        "activity_le": self.encoder_to_dict(self.activity_le),
        "exponent": self.exponent,
        "number_classes": self.number_classes,
        "case_id_key": self.case_id_key,
        "case_timestamp_key":self.case_timestamp_key, 
        "case_activity_key": self.case_activity_key
    }

dict_to_encoder(dic)

cast the dictionary to an encoder

Source code in server/nn_manager.py
def dict_to_encoder(self, dic:dict)->LabelEncoder:
    """
    cast the dictionary to an encoder
    """

    encoder = LabelEncoder()
    encoder.classes_ = np.array(list(dic.keys()))
    return encoder

encoder_to_dict(encoder)

cast the encoder to a dictionary

Source code in server/nn_manager.py
def encoder_to_dict(self, encoder:LabelEncoder)->dict:
    """
    cast the encoder to a dictionary 
    """
    return {label:index for index, label in enumerate(encoder.classes_)} 

load_config(dic)

used for importing

Source code in server/nn_manager.py
def load_config(self, dic):
    """
    used for importing
    """
    self.time_precision= time_precision.TimePrecision[dic["time_precision"]] 
    self.seq_len=int(dic["seq_len"])
    self.emb_dim=int(dic["emb_dim"])
    self.hid_dim=int(dic["hid_dim"])
    self.mlp_dim=int(dic["mlp_dim"])
    self.alpha=float(dic["alpha"])
    self.dropout=float(dic["dropout"])
    self.batch_size=int(dic["batch_size"])
    self.lr=float(dic["lr"])
    self.epochs=int(dic["epochs"])
    self.importance_weight =dic["importance_weight"] #string
    self.verbose_step = int(dic["verbose_step"])
    self.cuda = True if dic["cuda"]=="True" else False  
    self.absolute_frequency_distribution = Counter(dic["absolute_frequency_distribution"])
    self.case_id_le = self.dict_to_encoder(dic["case_id_le"])
    self.activity_le = self.dict_to_encoder(dic["activity_le"])
    self.exponent =int(dic["exponent"])
    self.number_classes =int(dic["number_classes"])
    self.case_activity_key = dic["case_activity_key"]
    self.case_id_key = dic["case_id_key"]
    self.case_timestamp_key = dic["case_timestamp_key"]

NNManagement

This is the NNManagement class.

Provided functionality:
  • Train the model based on the event log.
  • Test the model based on the event log.
  • Set params.
Source code in server/nn_manager.py
class NNManagement: 
    """
    This is the NNManagement class.

    Provided functionality:
        - Train the model based on the event log. 
        - Test the model based on the event log.
        - Set params. 

    """
    def __init__(self, config:Config|None = None):
        self.config = Config() if config == None else config
        self.f1 = None
        self.recall= None
        self.acc = None
        self.time_error = None


    def evaluate(self):
        """
        This is the testing function for the model. It prints out the time_error, precision, recall, and f1 score.

        Returns:
            time_error (float): The time error.
            acc (float): The accuracy.
            recall (float): The recall.
            f1 (float): The F1 score.
        """
        #: main testing function
        self.model.eval()
        pred_times, pred_events = [], [] #: model predictions
        gold_times, gold_events = [], [] #: targets

        for i, batch in enumerate(tqdm(self.test_loader)):
            #: batch: pair of tensors containing the time data and the event data, respectively
            gold_times.append(batch[0][:, -1].numpy()) # extract for each sequence the last time stamp / the last event
            gold_events.append(batch[1][:, -1].numpy())
            pred_time, pred_event = self.model.predict(batch)
            if np.isnan(pred_time).any():
                raise exceptions.NaNException()
            pred_times.append(pred_time)
            pred_events.append(pred_event)

        pred_times = np.concatenate(pred_times).reshape(-1)
        print(type(pred_times))
        gold_times = np.concatenate(gold_times).reshape(-1)
        pred_events = np.concatenate(pred_events).reshape(-1)
        gold_events = np.concatenate(gold_events).reshape(-1)
        self.time_error = RMTPP_torch.abs_error(pred_times, gold_times)  #compute errors

        self.acc, self.recall, self.f1 = RMTPP_torch.clf_metric(pred_events, gold_events, n_class=self.config.number_classes) #get the metrics
        print(f"time_error: {self.time_error}, PRECISION: {self.acc}, RECALL: {self.recall}, F1: {self.f1}")

        return self.time_error, self.acc, self.recall, self.f1

    def get_training_statistics(self):
        """
        Returns:
            str: The accuracy, recall, and F1 score as a JSON object in string format.
        """
        if self.acc == None and self.recall == None and self.f1 ==None: 
            raise exceptions.ModelNotTrainedYet()

        #: return the metrics as a dictionary
        return {
            "time error": str(self.time_error),
            "acc":self.acc, 
            "recall":self.recall,
            "f1":self.f1
        }

    def import_nn_model(self, path):
        """
        imports a .pt file
        """
        saved_model_contents = torch.load(path)
        config = saved_model_contents['config']
        lossweight = saved_model_contents['lossweight']
        self.model = RMTPP_torch.Net(config, lossweight)
        if config.cuda: 
            self.model.to("cuda")
        else:
            self.model.to("cpu")
        self.model.load_state_dict(saved_model_contents['model_state_dict'])
        self.model.optimizer.load_state_dict(saved_model_contents['optimizer_state_dict'])
        self.model.eval() # relevant for dropout layers.


    def export_nn_model(self, name="trained_model.pt"):
        """
        generates the .pt file containing the generated
        model. 

        model state dict contains
        optimizer state dict
        """
        dic = {
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.model.optimizer.state_dict(),
            'config': self.model.config, 
            'lossweight': self.model.lossweight
        }
        torch.save(dic, name)

    def random_search(self, search_parameters, iterations): 
        """
        Random search for the best hyperparameters. Saves the best model in the class.

        We only do this for the hid_dim, mlp_dim and emb_dim parameters. (decided arbitrarily, can be extended to other parameters as well.)

        Args:
            search_parameters (dict): Dictionary containing the search parameters.
                - 'hid_dim': [start, end]
                - 'mlp_dim': [start, end]
                - 'emb_dim': [start, end]
            iterations (int): Number of iterations.

        Returns:
            float: The best accuracy.
        """
        acc = 0
        best_model = None
        # self.load_data(train, test, case_id_key, timestamp_key, case_activity_key)
        for i in range(iterations): 
            a=random.randint(search_parameters["hid_dim"][0], search_parameters["hid_dim"][1])
            b=random.randint(search_parameters["mlp_dim"][0], search_parameters["mlp_dim"][1])
            c=  random.randint(search_parameters["emb_dim"][0], search_parameters["emb_dim"][1])
            self.config.hid_dim = a
            self.config.mlp_dim = b
            self.config.emb_dim = c
            self.train()
            if self.acc > acc: 
                self.config.hid_dim = a
                self.config.mlp_dim = b
                self.config.emb_dim = c
                acc = self.acc
                best_model = self.model
        self.model = best_model
        print(f"best accuracy: {acc}")
        return acc

    def grid_search(self, search_parameters): 
        """
        Grid search for the best hyperparameters.

        We only do this for the hid_dim, mlp_dim and emb_dim parameters. (decided arbitrarily, can be extended to other parameters as well.)

        Args:
            search_parameters (dict): Dictionary containing the search parameters.
            - 'hid_dim': [start, end, step]
            - 'mlp_dim': [start, end, step]
            - 'emb_dim': [start, end, step]

        Returns:
            float: The best accuracy.
        """
        acc = 0
        best_model = None
        # self.load_data(train, test, case_id_key, timestamp_key, case_activity_key)
        for i in range(search_parameters["hid_dim"][0], search_parameters["hid_dim"][1], search_parameters["hid_dim"][2]): 
                self.config.hid_dim =i 
                for j in range(search_parameters["mlp_dim"][0], search_parameters["mlp_dim"][1], search_parameters["mlp_dim"][2]): 
                    self.config.mlp_dim=j
                    for k in range(search_parameters["emb_dim"][0], search_parameters["emb_dim"][1], search_parameters["emb_dim"][2]):
                        self.config.emb_dim=k
                        self.config.our_implementation = True
                        self.train()
                        best_model = self.model
                        if self.acc> acc: 
                            acc = self.acc
        self.model = best_model
        print(f"best acc: {acc}")
        return acc

    def load_data(self,train_data, test_data, case_id, timestamp_key, event_key ):
        """
        imports the training and testing sublogs, which were preprocessed by the preprocessing module.

        it applies the sliding window algorithm to have subsequences of the same fixed length. 
        The output is passed to the respective DataLoader object, which computes the time differences 
        and casts the input to tensors;  generates the batches.
        """

        # apply sliding window
        train_set = RMTPP_torch.ATMDataset(self.config ,train_data, case_id,   timestamp_key, event_key ) 
        test_set =  RMTPP_torch.ATMDataset(self.config , test_data, case_id, timestamp_key, event_key)
        # generate time differences and load the data to torch tensors and generate the batches. 
        self.train_loader = DataLoader(train_set, batch_size=self.config.batch_size, shuffle=True, collate_fn= RMTPP_torch.ATMDataset.to_features)
        self.test_loader = DataLoader(test_set, batch_size=self.config.batch_size, shuffle=False, collate_fn= RMTPP_torch.ATMDataset.to_features)

        #: initialize a matrix to store the importance weights that will be passed to the CrossEntropyLoss object. 
        self.weight = np.ones(self.config.number_classes)
        if self.config.importance_weight:
            self.weight = train_set.importance_weight(self.config.absolute_frequency_distribution)



    def train(self):
            """
            This is the main training function.

            Args:
                train_data (DataFrame): The training data.
                test_data (DataFrame): The test data.
                case_id (str): The column name of the case ID in the data.
                timestamp_key (str): The key of the timestamp in the data.
                no_classes (int): The number of known markers.
            """
            # we already pass the split data to the ATM loader. ATMDataset uses the sliding window for generating the input for training.
            # since we are using tensors for training the sequence length remains fixed in each epoch, hence we cannot do "arbitrary length cuts" 
            # to the training data
            self.model =  RMTPP_torch.Net(self.config, lossweight=self.weight) #create a NN instance
            self.model.set_optimizer(total_step=len(self.train_loader) * self.config.epochs) 


            if self.config.cuda: 
                self.model.cuda() #GPU 

            for epc in range(self.config.epochs): #run the epochs
                self.model.train()  
                range_loss1 = range_loss2 = range_loss = 0
                for i, batch in enumerate(tqdm(self.train_loader)):

                    l1, l2, l = self.model.train_batch(batch) 
                    range_loss1 += l1
                    range_loss2 += l2
                    range_loss += l

                    if (i + 1) % self.config.verbose_step == 0:
                        print("time loss: ", range_loss1 / self.config.verbose_step)
                        print("event loss:", range_loss2 / self.config.verbose_step)
                        print("total loss:", range_loss / self.config.verbose_step)
                        range_loss1 = range_loss2 = range_loss = 0


            self.evaluate()
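A minimal end-to-end sketch of how this class might be driven together with the preprocessing module (paths and column names are placeholders; the exact wiring in the server's route handlers may differ):

from preprocessing import Preprocessing
from nn_manager import NNManagement, Config

# 1. import and format the event log
preprocessor = Preprocessing()
preprocessor.handle_import(False, "data/example_log.csv",
                           "case:concept:name", "time:timestamp", "concept:name")
train_df, test_df = preprocessor.split_train_test(0.8)

# 2. copy the values computed by the preprocessor into the model configuration
config = Config()
config.number_classes = preprocessor.number_classes
config.absolute_frequency_distribution = preprocessor.absolute_frequency_distribution
config.case_id_le = preprocessor.case_id_le
config.activity_le = preprocessor.activity_le
config.exponent = preprocessor.exponent
config.case_id_key = preprocessor.case_id_key
config.case_activity_key = preprocessor.case_activity_key
config.case_timestamp_key = preprocessor.case_timestamp_key

# 3. train, evaluate and export
manager = NNManagement(config)
manager.load_data(train_df, test_df, config.case_id_key,
                  config.case_timestamp_key, config.case_activity_key)
manager.train()
print(manager.get_training_statistics())
manager.export_nn_model("trained_model.pt")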

evaluate()

This is the testing function for the model. It prints out the time_error, precision, recall, and f1 score.

Returns:

  • time_error (float): The time error.
  • acc (float): The accuracy.
  • recall (float): The recall.
  • f1 (float): The F1 score.

Source code in server/nn_manager.py
def evaluate(self):
    """
    This is the testing function for the model. It prints out the time_error, precision, recall, and f1 score.

    Returns:
        time_error (float): The time error.
        acc (float): The accuracy.
        recall (float): The recall.
        f1 (float): The F1 score.
    """
    #: main testing function
    self.model.eval()
    pred_times, pred_events = [], [] #: model predictions
    gold_times, gold_events = [], [] #: targets

    for i, batch in enumerate(tqdm(self.test_loader)):
        #: batch: pair of tensors containing the time data and the event data, respectively
        gold_times.append(batch[0][:, -1].numpy()) # extract for each sequence the last time stamp / the last event
        gold_events.append(batch[1][:, -1].numpy())
        pred_time, pred_event = self.model.predict(batch)
        if np.isnan(pred_time).any():
            raise exceptions.NaNException()
        pred_times.append(pred_time)
        pred_events.append(pred_event)

    pred_times = np.concatenate(pred_times).reshape(-1)
    print(type(pred_times))
    gold_times = np.concatenate(gold_times).reshape(-1)
    pred_events = np.concatenate(pred_events).reshape(-1)
    gold_events = np.concatenate(gold_events).reshape(-1)
    self.time_error = RMTPP_torch.abs_error(pred_times, gold_times)  #compute errors

    self.acc, self.recall, self.f1 = RMTPP_torch.clf_metric(pred_events, gold_events, n_class=self.config.number_classes) #get the metrics
    print(f"time_error: {self.time_error}, PRECISION: {self.acc}, RECALL: {self.recall}, F1: {self.f1}")

    return self.time_error, self.acc, self.recall, self.f1

export_nn_model(name='trained_model.pt')

Generates a .pt file containing the trained model. The saved dictionary holds the model state dict, the optimizer state dict, the configuration, and the loss weights.

Source code in server/nn_manager.py, lines 229–243
def export_nn_model(self, name="trained_model.pt"):
    """
    generates the .pt file containing the generated
    model. 

    model state dict contains
    optimizer state dict
    """
    dic = {
        'model_state_dict': self.model.state_dict(),
        'optimizer_state_dict': self.model.optimizer.state_dict(),
        'config': self.model.config, 
        'lossweight': self.model.lossweight
    }
    torch.save(dic, name)

get_training_statistics()

Returns:

  • dict: The time error, accuracy, recall, and F1 score (serializable to JSON).

Source code in server/nn_manager.py, lines 196–210
def get_training_statistics(self):
    """
    Returns:
        str: The accuracy, recall, and F1 score as a JSON object in string format.
    """
    if self.acc == None and self.recall == None and self.f1 ==None: 
        raise exceptions.ModelNotTrainedYet()

    #: dumps generates a string
    return {
        "time error": str(self.time_error),
        "acc":self.acc, 
        "recall":self.recall,
        "f1":self.f1
    }

Grid search for the best hyperparameters.

We only do this for the hid_dim, mlp_dim and emb_dim parameters. (decided arbitrarily, can be extended to other parameters as well.)

Parameters:

  • search_parameters (dict, required): Dictionary containing the search parameters.
      - 'hid_dim': [start, end, step]
      - 'mlp_dim': [start, end, step]
      - 'emb_dim': [start, end, step]

Returns:

  • float: The best accuracy.

Source code in server/nn_manager.py, lines 282–313
def grid_search(self, search_parameters): 
    """
    Grid search for the best hyperparameters.

    We only do this for the hid_dim, mlp_dim and emb_dim parameters. (decided arbitrarily, can be extended to other parameters as well.)

    Args:
        search_parameters (dict): Dictionary containing the search parameters.
        - 'hid_dim': [start, end, step]
        - 'mlp_dim': [start, end, step]
        - 'emb_dim': [start, end, step]

    Returns:
        float: The best accuracy.
    """
    acc = 0
    best_model = None
    # self.load_data(train, test, case_id_key, timestamp_key, case_activity_key)
    for i in range(search_parameters["hid_dim"][0], search_parameters["hid_dim"][1], search_parameters["hid_dim"][2]):
        self.config.hid_dim = i
        for j in range(search_parameters["mlp_dim"][0], search_parameters["mlp_dim"][1], search_parameters["mlp_dim"][2]):
            self.config.mlp_dim = j
            for k in range(search_parameters["emb_dim"][0], search_parameters["emb_dim"][1], search_parameters["emb_dim"][2]):
                self.config.emb_dim = k
                self.config.our_implementation = True
                self.train()
                #: only keep the model if it improves on the best accuracy seen so far
                if self.acc > acc:
                    acc = self.acc
                    best_model = self.model
    self.model = best_model
    print(f"best acc: {acc}")
    return acc
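A minimal usage sketch (not part of the source): `manager` stands for an already constructed nn_manager object whose load_data() has been called with preprocessed sublogs; the ranges below are illustrative.

# hypothetical ranges; each entry is [start, end, step] as expected by grid_search
search_parameters = {
    "hid_dim": [32, 128, 32],
    "mlp_dim": [16, 64, 16],
    "emb_dim": [8, 32, 8],
}
best_acc = manager.grid_search(search_parameters)  # trains one model per combination
print("best accuracy:", best_acc)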

import_nn_model(path)

imports a .pt file

Source code in server/nn_manager.py, lines 212–226
def import_nn_model(self, path):
    """
    imports a .pt file
    """
    saved_model_contents = torch.load(path)
    config = saved_model_contents['config']
    lossweight = saved_model_contents['lossweight']
    self.model = RMTPP_torch.Net(config, lossweight)
    if config.cuda: 
        self.model.to("cuda")
    else:
        self.model.to("cpu")
    self.model.load_state_dict(saved_model_contents['model_state_dict'])
    self.model.optimizer.load_state_dict(saved_model_contents['optimizer_state_dict'])
    self.model.eval() # relevant for dropout layers.
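A short round-trip sketch (assuming `manager` is an nn_manager object holding a trained model):

manager.export_nn_model("trained_model.pt")  # saves model/optimizer state dicts, config and loss weights
manager.import_nn_model("trained_model.pt")  # rebuilds RMTPP_torch.Net and restores both state dicts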

load_data(train_data, test_data, case_id, timestamp_key, event_key)

Imports the training and testing sublogs that were preprocessed by the preprocessing module.

It applies the sliding window algorithm to obtain subsequences of the same fixed length. The output is passed to the respective DataLoader object, which computes the time differences, casts the input to tensors, and generates the batches.

Source code in server/nn_manager.py, lines 315–334
def load_data(self,train_data, test_data, case_id, timestamp_key, event_key ):
    """
    Imports the training and testing sublogs that were preprocessed by the preprocessing module.

    It applies the sliding window algorithm to obtain subsequences of the same fixed length.
    The output is passed to the respective DataLoader object, which computes the time differences,
    casts the input to tensors, and generates the batches.
    """

    # apply sliding window
    train_set = RMTPP_torch.ATMDataset(self.config ,train_data, case_id,   timestamp_key, event_key ) 
    test_set =  RMTPP_torch.ATMDataset(self.config , test_data, case_id, timestamp_key, event_key)
    # generate time differences and load the data to torch tensors and generate the batches. 
    self.train_loader = DataLoader(train_set, batch_size=self.config.batch_size, shuffle=True, collate_fn= RMTPP_torch.ATMDataset.to_features)
    self.test_loader = DataLoader(test_set, batch_size=self.config.batch_size, shuffle=False, collate_fn= RMTPP_torch.ATMDataset.to_features)

    #: initialize a vector to store the importance weights that will be passed to the CrossEntropyLoss object. 
    self.weight = np.ones(self.config.number_classes)
    if self.config.importance_weight:
        self.weight = train_set.importance_weight(self.config.absolute_frequency_distribution)
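The typical call order, sketched under the assumption that `train_df` and `test_df` are the sublogs produced by the preprocessing module and that the column names below match the imported log:

manager.load_data(train_df, test_df, "case:concept:name", "time:timestamp", "concept:name")
manager.train()                                   # consumes the loaders prepared by load_data
time_error, acc, recall, f1 = manager.evaluate()  # also called automatically at the end of train()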

Random search for the best hyperparameters. Saves the best model in the class.

We only do this for the hid_dim, mlp_dim and emb_dim parameters. (decided arbitrarily, can be extended to other parameters as well.)

Parameters:

  • search_parameters (dict, required): Dictionary containing the search parameters.
      - 'hid_dim': [start, end]
      - 'mlp_dim': [start, end]
      - 'emb_dim': [start, end]
  • iterations (int, required): Number of iterations.

Returns:

  • float: The best accuracy.

Source code in server/nn_manager.py, lines 245–280
def random_search(self, search_parameters, iterations): 
    """
    Random search for the best hyperparameters. Saves the best model in the class.

    We only do this for the hid_dim, mlp_dim and emb_dim parameters. (decided arbitrarily, can be extended to other parameters as well.)

    Args:
        search_parameters (dict): Dictionary containing the search parameters.
            - 'hid_dim': [start, end]
            - 'mlp_dim': [start, end]
            - 'emb_dim': [start, end]
        iterations (int): Number of iterations.

    Returns:
        float: The best accuracy.
    """
    acc = 0
    best_model = None
    # self.load_data(train, test, case_id_key, timestamp_key, case_activity_key)
    for i in range(iterations): 
        a = random.randint(search_parameters["hid_dim"][0], search_parameters["hid_dim"][1])
        b = random.randint(search_parameters["mlp_dim"][0], search_parameters["mlp_dim"][1])
        c = random.randint(search_parameters["emb_dim"][0], search_parameters["emb_dim"][1])
        #: assign each sampled value to its matching hyperparameter
        self.config.hid_dim = a
        self.config.mlp_dim = b
        self.config.emb_dim = c
        self.train()
        if self.acc > acc: 
            acc = self.acc
            best_model = self.model
    self.model = best_model
    print(f"best accuracy: {acc}")
    return acc
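A minimal sketch (same assumptions as for grid_search); here each range is only [start, end] and ten random configurations are sampled:

search_parameters = {
    "hid_dim": [32, 128],
    "mlp_dim": [16, 64],
    "emb_dim": [8, 32],
}
best_acc = manager.random_search(search_parameters, iterations=10)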

train()

This is the main training function.

train() takes no direct arguments; the data described below is expected to have been prepared beforehand via load_data():

  • train_data (DataFrame): The training data.
  • test_data (DataFrame): The test data.
  • case_id (str): The column name of the case ID in the data.
  • timestamp_key (str): The key of the timestamp in the data.
  • no_classes (int): The number of known markers.
Source code in server/nn_manager.py, lines 338–376
def train(self):
        """
        This is the main training function.

        The training and test data, the column keys and the number of known markers
        (classes) are taken from the configuration and from the loaders prepared by
        load_data; train() itself takes no arguments.
        """
        # the split data is already passed to the ATM loader. ATMDataset uses the sliding window
        # to generate the training input. since tensors are used for training, the sequence length
        # remains fixed in each epoch, hence "arbitrary length cuts" of the training data are not possible.
        self.model = RMTPP_torch.Net(self.config, lossweight=self.weight)  # create a NN instance
        self.model.set_optimizer(total_step=len(self.train_loader) * self.config.epochs) 


        if self.config.cuda: 
            self.model.cuda() #GPU 

        for epc in range(self.config.epochs): #run the epochs
            self.model.train()  
            range_loss1 = range_loss2 = range_loss = 0
            for i, batch in enumerate(tqdm(self.train_loader)):

                l1, l2, l = self.model.train_batch(batch) 
                range_loss1 += l1
                range_loss2 += l2
                range_loss += l

                if (i + 1) % self.config.verbose_step == 0:
                    print("time loss: ", range_loss1 / self.config.verbose_step)
                    print("event loss:", range_loss2 / self.config.verbose_step)
                    print("total loss:", range_loss / self.config.verbose_step)
                    range_loss1 = range_loss2 = range_loss = 0


        self.evaluate()

Prediction management module

This module is in charge of administering prediction generation.

The following two kinds of predictions can be made:
  • single predictions (one step into the future, returning the most likely (event, timestamp) pair)
  • multiple predictions (generating a predictive tree); these can be saved to a file.

Predictions are also decoded.

This module is also used by the process_model_manager module, which calls the multiple prediction manager repeatedly. Since that manager supports different options for how the cut sequences should be restored, the parametrized function multiple_prediction_linear is provided, which grants some runtime benefits.
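A minimal usage sketch of the two prediction modes, assuming `model`, `config` and an encoded single-case dataframe `df` (produced by the preprocessing module) are already available; the column names are illustrative:

pm = PredictionManager(model, "case:concept:name", "concept:name", "time:timestamp", config)

# single prediction: most likely (event, timestamp) pair one step into the future
time_pred, event_pred, prob = pm.single_prediction_dataframe(df)
print(pm.jsonify_single(time_pred, event_pred, prob))

# multiple predictions: predictive tree, 3 steps ahead, 2 most likely events per step
pm.multiple_prediction_dataframe(depth=3, degree=2, df=df)
tree = pm.jsonify_paths()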

PredictionManager

Source code in server/prediction_manager.py, lines 33–451
class PredictionManager: 
    def __init__(self, model, case_id_key, activity_key, timestamp_key, config):
        """
        Initializes the PredictionManager object.

        Args:
            model (object): The model used for doing predictions.
            case_id_key (str): The case id key of the log.
            activity_key (str): The activity key of the log.
            timestamp_key (str): The timestamp key of the log.
            config (Config): The configuration used for training and important hyperparameters.
        """
        self.model = model  # We assume there is an already existing model.
        self.case_id_key = case_id_key  # We assume the three important columns are known.
        self.activity_key = activity_key
        self.timestamp_key = timestamp_key
        self.config = config

        self.current_case_id = None
        self.paths = [] #: generated paths (multiple prediction generation)
        self.decoded_paths = [] #: decoded paths (multiple prediction generation)
        self.encoded_df = None 


        #: the following arrays are used for backtracking. 
        self.recursive_event_seqs = []  
        self.recursive_time_seqs = []
        self.recursive_time_diffs = []
        self.end_activities = {}


    def get_dummy_process(self, df, case_id_column):
        """
        just used for testing; create a dummy input df.
        """
        case_ids = df[case_id_column].unique()
        selected_id = -1
        length = 0
        for id in case_ids: 
            length = len(df[df[case_id_column]==id])
            if length>=self.config.seq_len:
                selected_id = id
                break
        if selected_id == -1: #: no sequence of appropriate length for input
            raise exceptions.SeqLengthTooHigh()             

        random_cut = random.randint(self.config.seq_len+1, length ) 
        dummy = df[df[case_id_column]==selected_id]

        return dummy.iloc[:random_cut]

    def check_input_uniqueness(self):
        """
        the input df must contain only one process, hence check that there is one unique case_id 
        """
        return len(self.encoded_df[self.case_id_key].unique()) == 1

    def single_prediction_dataframe(self, df):
        """
        make one prediction given a dataframe. 
        preprocessor is in charge of doing the
        reading/importing from csv, xes, commandline, etc...
        """
        self.encoded_df = df

        #: check case id uniqueness
        if not self.check_input_uniqueness():
            raise exceptions.NotOneCaseId()
        self.current_case_id = self.encoded_df[self.case_id_key].sample(n = 1).values[0]
        return self.single_prediction()

    def single_prediction(self):
        """
        make one prediction given a partial process. 
        """
        #: sliding window

        step1 = RMTPP_torch.ATMDataset(self.config,self.encoded_df, self.case_id_key, self.timestamp_key, self.activity_key)
        #: just create one batch
        step2 = DataLoader(step1, batch_size=len(step1.time_seqs), shuffle=False, collate_fn=RMTPP_torch.ATMDataset.to_features)

        #: get the batch
        batch = next(iter(step2))
        event_pred, prob, time_pred= self.model.predict(batch, pm_active = True)

        return time_pred, event_pred, prob

    def jsonify_single(self, time_pred, event_pred, prob): 
        """
        note that we just save the
        probability of the last pair (time, event) in the path, 
        since the nn calculates lambda*(t) (see paper), which is 
        the probability of the last predicted event happening
        in the predicted time t. 
        """
        # decode event
        decoded_event = self.config.activity_le.inverse_transform([event_pred])

        # decode the timestamp
        timestamps = self.encoded_df[self.timestamp_key].copy()


        timestamps = timestamps*(10**(self.config.exponent))

        #: this -5 displacement is used to tackle the accuracy problem. it is set
        # arbitrarily.
        time_pred = time_pred*(10**(self.config.exponent-5)) 

        timestamps.iloc[-1]= timestamps.iloc[-1]+ time_pred
        timestamps = timestamps.astype("datetime64[ns]")
        new_time = timestamps.iloc[-1]

        ans = {
            "predicted_time":str(new_time), 
            "predicted_event":decoded_event[0], 
            "probability": prob
        }
        return json.dumps(ans)



    def get_differences(self):
        """
        calculates time differences. 
        """
        local = []
        for seq in self.recursive_time_seqs:
            seq = np.insert(seq, 0, seq[0])
            seq= np.diff(seq)
            local.append(seq)
        return np.array(local)


    def append_one_difference_array(self, lst):
        """
        Appends one difference array to self.recursive_time_diffs.

        Args:
            lst (list): List used for calculating the contiguous differences.
        """
        #: Extend the list by one element
        time = np.array([lst[0]]+ lst)
        #: Get the differences between contiguous elements.
        time = np.diff(time)
        self.recursive_time_diffs= np.append(self.recursive_time_diffs, [time], axis = 0)

    def multiple_prediction_linear(self, depth, nonstop ,upper): 
        """
        this is a special case of multiple prediction
        where the degree= 1. we avoid backtracking and recursion for
        efficiency reasons.  
        """
        #: get the event and timestamp of the last event in the partial process.
        c_t =self.encoded_df[self.timestamp_key].iloc[-1]
        c_e =self.encoded_df[self.activity_key].iloc[-1]

        #: compute sliding window
        self.recursive_atm = RMTPP_torch.ATMDataset(self.config, self.encoded_df, self.case_id_key, self.timestamp_key, self.activity_key, True)
        self.recursive_time_seqs = self.recursive_atm.time_seqs
        self.recursive_event_seqs = self.recursive_atm.event_seqs

        #: compute differences within each window
        self.recursive_time_diffs= self.get_differences()


        if nonstop:
            #: in case that we run until end activity
            self.linear_iterative_predictor_non_stop(c_t, c_e, upper)
        else:
            #: in case we run until a given depth
            self.linear_iterative_predictor(depth, c_t, c_e)


    def linear_iterative_predictor_non_stop(self, start_time, start_event, upper=float("inf")):
        """
        Predicts the path of events iteratively until an end activity is found or the upper bound is reached.

        Args:
            start_time (float): The start time of the path.
            start_event: The start event of the path.
            upper (float, optional): The upper bound for the amount of iterations. Defaults to float("inf").
        """
        c_t = start_time
        c_e = start_event
        path = [(c_t, (1, c_e))]
        i = 0
        while not self.end_activities[c_e] and i < upper:
            p_t, p_events = self.get_sorted_wrapper()
            p_pair = p_events[0]
            path.append((p_t[0], (p_pair[0], p_pair[1])))
            self.append_to_log(p_t[0], p_pair[1])
            c_t = p_t[0]
            c_e = p_pair[1]  #: p_pair is (probability, event); track the predicted event
            i += 1
        self.paths.append(path)


    def linear_iterative_predictor(self, depth, start_time, start_event): 
        """
        makes predictions linearly (ie no backtracking and branching degree = 1) , and also 
        iteratively (no recursion)
        """
        c_t = start_time
        c_e = start_event
        path = [(c_t , (1,c_e))]
        for i in range(depth): 
            p_t, p_events = self.get_sorted_wrapper()
            p_pair = p_events[0]
            path.append((p_t[0], (p_pair[0], p_pair[1] )))
            self.append_to_log(p_t[0], p_pair[1])
        self.paths.append(path)

    def multiple_prediction(self, depth, degree): 
            """
            Get a list of possible paths starting at the last timestamp and event pair.

            Args:
                depth (int): The number of steps in the future to be predicted.
                degree (int): The number of predictions on each step to be considered.


            This method loads data, gets windows, computes paths, and decodes paths.
            It requires the configuration used for the NN, which is required by the ATM Dataset.
            """
            c_t = self.encoded_df[self.timestamp_key].iloc[-1]
            c_e = self.encoded_df[self.activity_key].iloc[-1]

            #:load data, get windows
            self.recursive_atm = RMTPP_torch.ATMDataset(self.config, self.encoded_df, self.case_id_key, self.timestamp_key, self.activity_key, True)
            self.recursive_time_seqs = self.recursive_atm.time_seqs
            self.recursive_event_seqs = self.recursive_atm.event_seqs

            #:get differences
            self.recursive_time_diffs = self.get_differences()

            #: compute paths
            self.backtracking_prediction_tree(c_t, c_e, 0, depth, degree, [(c_t, (1, c_e))]) 

            #: decode paths
            self.decode_paths()


    def backtracking_prediction_tree(self, c_t, c_e, c_d, depth, degree,current_path):
        """
        use backtracking to generate all the paths from the given 
        last timestamp and marker considering the input degree as a threshold 
        and the maximum depth for the generated tree.
        """
        if c_d >= depth: 
            # base case
            self.paths.append(list(current_path))
            return
        p_t, p_events = self.get_sorted_wrapper( )
        for p_e in p_events[:degree]:
            # filter branching degree ; the list is already sorted
            # therefore the "degree" most probable are taken
            self.append_to_log(p_t[0], p_e[1]) 
            current_path.append((p_t[0], p_e))
            self.backtracking_prediction_tree(p_t[0], p_e[1], c_d+1, depth, degree, list(current_path))    
            current_path.pop() 
            self.pop_from_log()

    def get_sorted_wrapper(self):

        #: check whether the number of rows in 
        # self.encoded_df <= seq_len. otherwise the
        # processed data by dataloader/atmdataset 
        # output empty lists... 
        if self.config.seq_len>= len(self.encoded_df):
            raise exceptions.SeqLengthTooHigh()

        self.recursive_atm.event_seqs = self.recursive_event_seqs
        self.recursive_atm.time_seqs = self.recursive_time_seqs


        #: do not use a dataloader for batch generation, too inefficient
        #: differences are also computed in a smarter way, as well as windows.
        batch = ( torch.tensor(self.recursive_time_diffs,dtype=torch.float32), torch.tensor(self.recursive_event_seqs, dtype=torch.int64)) 

        pred_times, pred_events = [], []
        pred_time, pred_event = self.model.predict_sorted(batch)

        pred_times.append(pred_time)
        pred_events.append(pred_event)

        pred_times= pred_times[-1][-1] #we are only interested in the last one; unpack the batch
        pred_events = pred_events[-1][-1]

        return pred_times, pred_events


    def append_to_log(self, time, event): 
        """
        Appends a window and a difference array to an existing list instead of calling ATMDataset and Dataloader on each iterative call of the prediction generator.

        Args:
            time (float): The newly predicted timestamp.
            event: The newly predicted event.
        """
        last_time_seq = list(self.recursive_time_seqs[-1])
        last_event_seq = list(self.recursive_event_seqs[-1])
        new_time_seq = last_time_seq[1:]
        new_event_seq = last_event_seq[1:]
        new_time_seq.append(time)
        new_event_seq.append(event)
        self.recursive_time_seqs.append(new_time_seq)
        self.recursive_event_seqs.append(new_event_seq)
        self.append_one_difference_array(new_time_seq)

    def pop_from_log(self): 
        """
        used for backtracking to restore the old path
        """
        self.recursive_time_seqs.pop()
        self.recursive_event_seqs.pop()
        self.recursive_time_diffs= np.delete(self.recursive_time_diffs, len(self.recursive_time_diffs)-1, axis = 0) 

    def decode_paths(self):
        """
        used for decoding the events and timestamps in the generated paths. 
        The timestamps are NOT decoded, since the predictions are TIMEDELTAS
        """

        self.decoded_paths = []
        for path in self.paths: 
            encoded_events = [event_index for _, (_, event_index) in path]
            encoded_events = list(map(int, encoded_events))
            di = self.config.encoder_to_dict(self.config.activity_le)

            decoded_events = self.config.activity_le.inverse_transform(encoded_events)

            decoded_path= [(time, (prob, event)) for (time, (prob, _)), event in zip(path, decoded_events) ]
            self.decoded_paths.append(decoded_path)

    def jsonify_paths(self):
        """
        note that we just save the
        probability of the last pair (time, event) in the path, 
        since the nn calculates lambda*(t), which is 
        the probability of the last predicted event happening
        in the predicted time t. 

        paths markers are assumed to be decoded. 
        """
        ans = {
            "paths": []
        }
        for path in self.decoded_paths :
            current_path = {
                'pairs':[], 
                'percentage': str(path[-1][1][0])
            }
            first_time, (first_percentage, first_event) = path[0]
            rest = path[1:]

            print(first_time)
            first_time = first_time*(10**self.config.exponent)
            #: rescale the timedeltas


            #: the -5 displacement is used to tackle the accuracy problem. it is set arbitrarily.
            rest_times = [math.ceil(time*(10**(self.config.exponent-5))) for time, (prob, event) in rest] 
            for i in range(1,len(rest_times)-1):  #:calculate cumsum
                rest_times[i] = rest_times[i]+ rest_times[i-1] 

            #: calculate the timestamp based on the first event 
            rest_times = [time+first_time for time in rest_times]
            times= [first_time] + rest_times
            temp_df = pd.DataFrame({"times": times})
            temp_df["times"] = temp_df["times"].astype("datetime64[ns]") #: a dataframe is used because we want to revese the encoding 
            # with the same algorithm (in thie case provided by pandas)
            times_decoded = temp_df["times"].tolist()


            for path_element, decoded_time in zip(path, times_decoded):
                current_path["pairs"].append(
                    {
                        "time":str(decoded_time) , 
                        "event": str(path_element[1][1]), 
                    }
                )
            ans["paths"].append(current_path)
        return ans

    def multiple_prediction_dataframe(self, depth, degree, df, linear=False, non_stop = False, upper = 30):
        """
        make multiple predictions given a dataframe
        preprocessor is in charge of doing the
        reading/importing from csv, xes, commandline, etc...
        it is assumed that the event log contains only one case id.
        """

        """
        preprocessor = preprocessing.Preprocessing()

        #; its important to pass the existing encoders to avoid recomputation of the language encoders!

        preprocessor.import_event_log_dataframe(df, self.case_id_key, self.activity_key, self.timestamp_key)

        self.encoded_df= preprocessor.event_df 
        """
        self.encoded_df = df
        self.current_case_id= self.encoded_df[self.case_id_key].sample(n = 1).values[0]


        di = self.config.encoder_to_dict(self.config.activity_le)
        if not linear: 
            """
            case backtracking needed; never called by prediction manager.  
            """
            self.multiple_prediction(depth, degree)
        else: 
            """
            the linear version is only being used for
            the iterative use of the multiple prediction generation.
            it mostly contains optimizations that are
            intended for the process model manager. 
            """
            self.multiple_prediction_linear(depth, non_stop, upper)

__init__(model, case_id_key, activity_key, timestamp_key, config)

Initializes the PredictionManager object.

Parameters:

  • model (object, required): The model used for doing predictions.
  • case_id_key (str, required): The case id key of the log.
  • activity_key (str, required): The activity key of the log.
  • timestamp_key (str, required): The timestamp key of the log.
  • config (Config, required): The configuration used for training and important hyperparameters.
Source code in server/prediction_manager.py, lines 34–61
def __init__(self, model, case_id_key, activity_key, timestamp_key, config):
    """
    Initializes the PredictionManager object.

    Args:
        model (object): The model used for doing predictions.
        case_id_key (str): The case id key of the log.
        activity_key (str): The activity key of the log.
        timestamp_key (str): The timestamp key of the log.
        config (Config): The configuration used for training and important hyperparameters.
    """
    self.model = model  # We assume there is an already existing model.
    self.case_id_key = case_id_key  # We assume the three important columns are known.
    self.activity_key = activity_key
    self.timestamp_key = timestamp_key
    self.config = config

    self.current_case_id = None
    self.paths = [] #: generated paths (multiple prediction generation)
    self.decoded_paths = [] #: decoded paths (multiple prediction generation)
    self.encoded_df = None 


    #: the following arrays are used for backtracking. 
    self.recursive_event_seqs = []  
    self.recursive_time_seqs = []
    self.recursive_time_diffs = []
    self.end_activities = {}

append_one_difference_array(lst)

Appends one difference array to self.recursive_time_diffs.

Parameters:

  • lst (list, required): List used for calculating the contiguous differences.
Source code in server/prediction_manager.py, lines 166–177
def append_one_difference_array(self, lst):
    """
    Appends one difference array to self.recursive_time_diffs.

    Args:
        lst (list): List used for calculating the contiguous differences.
    """
    #: Extend the list by one element
    time = np.array([lst[0]]+ lst)
    #: Get the differences between contiguous elements.
    time = np.diff(time)
    self.recursive_time_diffs= np.append(self.recursive_time_diffs, [time], axis = 0)

append_to_log(time, event)

Appends a window and a difference array to an existing list instead of calling ATMDataset and Dataloader on each iterative call of the prediction generator.

Parameters:

  • time (float, required): The newly predicted timestamp.
  • event (required): The newly predicted event.
Source code in server/prediction_manager.py, lines 324–340
def append_to_log(self, time, event): 
    """
    Appends a window and a difference array to an existing list instead of calling ATMDataset and Dataloader on each iterative call of the prediction generator.

    Args:
        time (float): The newly predicted timestamp.
        event: The newly predicted event.
    """
    last_time_seq = list(self.recursive_time_seqs[-1])
    last_event_seq = list(self.recursive_event_seqs[-1])
    new_time_seq = last_time_seq[1:]
    new_event_seq = last_event_seq[1:]
    new_time_seq.append(time)
    new_event_seq.append(event)
    self.recursive_time_seqs.append(new_time_seq)
    self.recursive_event_seqs.append(new_event_seq)
    self.append_one_difference_array(new_time_seq)

backtracking_prediction_tree(c_t, c_e, c_d, depth, degree, current_path)

use backtracking to generate all the paths from the given last timestamp and marker considering the input degree as a threshold and the maximum depth for the generated tree.

Source code in server/prediction_manager.py, lines 275–293
def backtracking_prediction_tree(self, c_t, c_e, c_d, depth, degree,current_path):
    """
    use backtracking to generate all the paths from the given 
    last timestamp and marker considering the input degree as a threshold 
    and the maximum depth for the generated tree.
    """
    if c_d >= depth: 
        # base case
        self.paths.append(list(current_path))
        return
    p_t, p_events = self.get_sorted_wrapper( )
    for p_e in p_events[:degree]:
        # filter branching degree ; the list is already sorted
        # therefore the "degree" most probable are taken
        self.append_to_log(p_t[0], p_e[1]) 
        current_path.append((p_t[0], p_e))
        self.backtracking_prediction_tree(p_t[0], p_e[1], c_d+1, depth, degree, list(current_path))    
        current_path.pop() 
        self.pop_from_log()
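As a rough cost estimate (an assumption based on the code above, which expands up to `degree` children per node until `depth` is reached), the number of stored paths grows exponentially:

depth, degree = 3, 2
max_paths = degree ** depth   # up to 8 candidate paths, each of length depth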

check_input_uniqueness()

The input df must contain only one process, hence check that there is one unique case_id.

Source code in server/prediction_manager.py, lines 84–88
def check_input_uniqueness(self):
    """
    the input df must contain only one process, hence check that there is one unique case_id 
    """
    return len(self.encoded_df[self.case_id_key].unique()) == 1

decode_paths()

used for decoding the events and timestamps in the generated paths. The timestamps are NOT decoded, since the predictions are TIMEDELTAS

Source code in server/prediction_manager.py, lines 350–365
def decode_paths(self):
    """
    used for decoding the events and timestamps in the generated paths. 
    The timestamps are NOT decoded, since the predictions are TIMEDELTAS
    """

    self.decoded_paths = []
    for path in self.paths: 
        encoded_events = [event_index for _, (_, event_index) in path]
        encoded_events = list(map(int, encoded_events))
        di = self.config.encoder_to_dict(self.config.activity_le)

        decoded_events = self.config.activity_le.inverse_transform(encoded_events)

        decoded_path= [(time, (prob, event)) for (time, (prob, _)), event in zip(path, decoded_events) ]
        self.decoded_paths.append(decoded_path)

get_differences()

calculates time differences.

Source code in server/prediction_manager.py, lines 154–163
def get_differences(self):
    """
    calculates time differences. 
    """
    local = []
    for seq in self.recursive_time_seqs:
        seq = np.insert(seq, 0, seq[0])
        seq= np.diff(seq)
        local.append(seq)
    return np.array(local)
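A worked example of the difference computation above: duplicating the first element before np.diff makes the first difference zero, so each window keeps its original length.

import numpy as np

seq = np.array([10.0, 12.0, 15.0])
seq = np.insert(seq, 0, seq[0])   # -> [10., 10., 12., 15.]
print(np.diff(seq))               # -> [0., 2., 3.]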

get_dummy_process(df, case_id_column)

just used for testing; create a dummy input df.

Source code in server/prediction_manager.py, lines 64–82
def get_dummy_process(self, df, case_id_column):
    """
    just used for testing; create a dummy input df.
    """
    case_ids = df[case_id_column].unique()
    selected_id = -1
    length = 0
    for id in case_ids: 
        length = len(df[df[case_id_column]==id])
        if length>=self.config.seq_len:
            selected_id = id
            break
    if selected_id == -1: #: no sequence of appropriate length for input
        raise exceptions.SeqLengthTooHigh()             

    random_cut = random.randint(self.config.seq_len+1, length ) 
    dummy = df[df[case_id_column]==selected_id]

    return dummy.iloc[:random_cut]

jsonify_paths()

note that we just save the probability of the last pair (time, event) in the path, since the nn calculates lambda*(t), which is the probability of the last predicted event happening in the predicted time t.

paths markers are assumed to be decoded.

Source code in server/prediction_manager.py, lines 367–415
def jsonify_paths(self):
    """
    note that we just save the
    probability of the last pair (time, event) in the path, 
    since the nn calculates lambda*(t), which is 
    the probability of the last predicted event happening
    in the predicted time t. 

    paths markers are assumed to be decoded. 
    """
    ans = {
        "paths": []
    }
    for path in self.decoded_paths :
        current_path = {
            'pairs':[], 
            'percentage': str(path[-1][1][0])
        }
        first_time, (first_percentage, first_event) = path[0]
        rest = path[1:]

        print(first_time)
        first_time = first_time*(10**self.config.exponent)
        #: rescale the timedeltas


        #: the -5 displacement is used to tackle the accuracy problem. it is set arbitrarily.
        rest_times = [math.ceil(time*(10**(self.config.exponent-5))) for time, (prob, event) in rest] 
        for i in range(1,len(rest_times)-1):  #:calculate cumsum
            rest_times[i] = rest_times[i]+ rest_times[i-1] 

        #: calculate the timestamp based on the first event 
        rest_times = [time+first_time for time in rest_times]
        times= [first_time] + rest_times
        temp_df = pd.DataFrame({"times": times})
        temp_df["times"] = temp_df["times"].astype("datetime64[ns]") #: a dataframe is used because we want to revese the encoding 
        # with the same algorithm (in thie case provided by pandas)
        times_decoded = temp_df["times"].tolist()


        for path_element, decoded_time in zip(path, times_decoded):
            current_path["pairs"].append(
                {
                    "time":str(decoded_time) , 
                    "event": str(path_element[1][1]), 
                }
            )
        ans["paths"].append(current_path)
    return ans

jsonify_single(time_pred, event_pred, prob)

note that we just save the probability of the last pair (time, event) in the path, since the nn calculates lambda*(t) (see paper), which is the probability of the last predicted event happening in the predicted time t.

Source code in server/prediction_manager.py, lines 120–150
def jsonify_single(self, time_pred, event_pred, prob): 
    """
    note that we just save the
    probability of the last pair (time, event) in the path, 
    since the nn calculates lambda*(t) (see paper), which is 
    the probability of the last predicted event happening
    in the predicted time t. 
    """
    # decode event
    decoded_event = self.config.activity_le.inverse_transform([event_pred])

    # decode the timestamp
    timestamps = self.encoded_df[self.timestamp_key].copy()


    timestamps = timestamps*(10**(self.config.exponent))

    #: this -5 displacement is used to tackle the accuracy problem. it is set
    # arbitrarily.
    time_pred = time_pred*(10**(self.config.exponent-5)) 

    timestamps.iloc[-1]= timestamps.iloc[-1]+ time_pred
    timestamps = timestamps.astype("datetime64[ns]")
    new_time = timestamps.iloc[-1]

    ans = {
        "predicted_time":str(new_time), 
        "predicted_event":decoded_event[0], 
        "probability": prob
    }
    return json.dumps(ans)
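An illustration of the decoding step above (the exponent value is a made-up example; in practice it is the one stored by the preprocessing module in config.exponent):

import pandas as pd

exponent = 14                                   # hypothetical config.exponent value
encoded = 1.7e4                                 # encoded timestamp produced by preprocessing
nanoseconds = int(encoded * (10 ** exponent))   # undo the scaling -> ns since the epoch
print(pd.Series([nanoseconds]).astype("datetime64[ns]").iloc[0])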

linear_iterative_predictor(depth, start_time, start_event)

Makes predictions linearly (i.e. no backtracking and branching degree = 1) and iteratively (no recursion).

Source code in server/prediction_manager.py, lines 230–243
def linear_iterative_predictor(self, depth, start_time, start_event): 
    """
    makes predictions linearly (ie no backtracking and branching degree = 1) , and also 
    iteratively (no recursion)
    """
    c_t = start_time
    c_e = start_event
    path = [(c_t , (1,c_e))]
    for i in range(depth): 
        p_t, p_events = self.get_sorted_wrapper()
        p_pair = p_events[0]
        path.append((p_t[0], (p_pair[0], p_pair[1] )))
        self.append_to_log(p_t[0], p_pair[1])
    self.paths.append(path)

linear_iterative_predictor_non_stop(start_time, start_event, upper=float('inf'))

Predicts the path of events iteratively until an end activity is found or the upper bound is reached.

Parameters:

  • start_time (float, required): The start time of the path.
  • start_event (required): The start event of the path.
  • upper (float, optional): The upper bound for the number of iterations. Defaults to float("inf").
Source code in server/prediction_manager.py, lines 206–227
def linear_iterative_predictor_non_stop(self, start_time, start_event, upper=float("inf")):
    """
    Predicts the path of events iteratively until an end activity is found or the upper bound is reached.

    Args:
        start_time (float): The start time of the path.
        start_event: The start event of the path.
        upper (float, optional): The upper bound for the amount of iterations. Defaults to float("inf").
    """
    c_t = start_time
    c_e = start_event
    path = [(c_t, (1, c_e))]
    i = 0
    while not self.end_activities[c_e] and i < upper:
        p_t, p_events = self.get_sorted_wrapper()
        p_pair = p_events[0]
        path.append((p_t[0], (p_pair[0], p_pair[1])))
        self.append_to_log(p_t[0], p_pair[1])
        c_t = p_t[0]
        c_e = p_pair[1]  #: p_pair is (probability, event); track the predicted event
        i += 1
    self.paths.append(path)

multiple_prediction(depth, degree)

Get a list of possible paths starting at the last timestamp and event pair.

Parameters:

  • depth (int, required): The number of steps in the future to be predicted.
  • degree (int, required): The number of predictions on each step to be considered.

This method loads data, gets windows, computes paths, and decodes paths. It requires the configuration used for the NN, which is required by the ATM Dataset.

Source code in server/prediction_manager.py, lines 245–272
def multiple_prediction(self, depth, degree): 
        """
        Get a list of possible paths starting at the last timestamp and event pair.

        Args:
            depth (int): The number of steps in the future to be predicted.
            degree (int): The number of predictions on each step to be considered.


        This method loads data, gets windows, computes paths, and decodes paths.
        It requires the configuration used for the NN, which is required by the ATM Dataset.
        """
        c_t = self.encoded_df[self.timestamp_key].iloc[-1]
        c_e = self.encoded_df[self.activity_key].iloc[-1]

        #:load data, get windows
        self.recursive_atm = RMTPP_torch.ATMDataset(self.config, self.encoded_df, self.case_id_key, self.timestamp_key, self.activity_key, True)
        self.recursive_time_seqs = self.recursive_atm.time_seqs
        self.recursive_event_seqs = self.recursive_atm.event_seqs

        #:get differences
        self.recursive_time_diffs = self.get_differences()

        #: compute paths
        self.backtracking_prediction_tree(c_t, c_e, 0, depth, degree, [(c_t, (1, c_e))]) 

        #: decode paths
        self.decode_paths()

multiple_prediction_dataframe(depth, degree, df, linear=False, non_stop=False, upper=30)

Make multiple predictions given a dataframe. The preprocessor is in charge of the reading/importing from csv, xes, command line, etc. It is assumed that the event log contains only one case id.

Source code in server/prediction_manager.py, lines 417–451
def multiple_prediction_dataframe(self, depth, degree, df, linear=False, non_stop = False, upper = 30):
    """
    make multiple predictions given a dataframe
    preprocessor is in charge of doing the
    reading/importing from csv, xes, commandline, etc...
    it is assumed that the event log contains only one case id.
    """

    """
    preprocessor = preprocessing.Preprocessing()

    #; its important to pass the existing encoders to avoid recomputation of the language encoders!

    preprocessor.import_event_log_dataframe(df, self.case_id_key, self.activity_key, self.timestamp_key)

    self.encoded_df= preprocessor.event_df 
    """
    self.encoded_df = df
    self.current_case_id= self.encoded_df[self.case_id_key].sample(n = 1).values[0]


    di = self.config.encoder_to_dict(self.config.activity_le)
    if not linear: 
        """
        case backtracking needed; never called by prediction manager.  
        """
        self.multiple_prediction(depth, degree)
    else: 
        """
        the linear version is only being used for
        the iterative use of the multiple prediction generation.
        it mostly contains optimizations that are
        intended for the process model manager. 
        """
        self.multiple_prediction_linear(depth, non_stop, upper)

multiple_prediction_linear(depth, nonstop, upper)

This is a special case of multiple prediction where the degree = 1. Backtracking and recursion are avoided for efficiency reasons.

Source code in server/prediction_manager.py, lines 179–203
def multiple_prediction_linear(self, depth, nonstop ,upper): 
    """
    this is a special case of multiple prediction
    where the degree= 1. we avoid backtracking and recursion for
    efficiency reasons.  
    """
    #: get the event and timestamp of the last event in the partial process.
    c_t =self.encoded_df[self.timestamp_key].iloc[-1]
    c_e =self.encoded_df[self.activity_key].iloc[-1]

    #: compute sliding window
    self.recursive_atm = RMTPP_torch.ATMDataset(self.config, self.encoded_df, self.case_id_key, self.timestamp_key, self.activity_key, True)
    self.recursive_time_seqs = self.recursive_atm.time_seqs
    self.recursive_event_seqs = self.recursive_atm.event_seqs

    #: compute differences within each window
    self.recursive_time_diffs= self.get_differences()


    if nonstop:
        #: in case that we run until end activity
        self.linear_iterative_predictor_non_stop(c_t, c_e, upper)
    else:
        #: in case we run until a given depth
        self.linear_iterative_predictor(depth, c_t, c_e)

pop_from_log()

used for backtracking to restore the old path

Source code in server/prediction_manager.py, lines 342–348
def pop_from_log(self): 
    """
    used for backtracking to restore the old path
    """
    self.recursive_time_seqs.pop()
    self.recursive_event_seqs.pop()
    self.recursive_time_diffs= np.delete(self.recursive_time_diffs, len(self.recursive_time_diffs)-1, axis = 0) 

single_prediction()

make one prediction given a partial process.

Source code in server/prediction_manager.py, lines 104–118
def single_prediction(self):
    """
    make one prediction given a partial process. 
    """
    #: sliding window

    step1 = RMTPP_torch.ATMDataset(self.config,self.encoded_df, self.case_id_key, self.timestamp_key, self.activity_key)
    #: just create one batch
    step2 = DataLoader(step1, batch_size=len(step1.time_seqs), shuffle=False, collate_fn=RMTPP_torch.ATMDataset.to_features)

    #: get the batch
    batch = next(iter(step2))
    event_pred, prob, time_pred= self.model.predict(batch, pm_active = True)

    return time_pred, event_pred, prob

single_prediction_dataframe(df)

Make one prediction given a dataframe. The preprocessor is in charge of the reading/importing from csv, xes, command line, etc.

Source code in server/prediction_manager.py, lines 90–102
def single_prediction_dataframe(self, df):
    """
    make one prediction given a dataframe. 
    preprocessor is in charge of doing the
    reading/importing from csv, xes, commandline, etc...
    """
    self.encoded_df = df

    #: check case id uniqueness
    if not self.check_input_uniqueness():
        raise exceptions.NotOneCaseId()
    self.current_case_id = self.encoded_df[self.case_id_key].sample(n = 1).values[0]
    return self.single_prediction()

Process model manager module

This module implements all necessary functions for conformance checking and fitness analysis.

Functions:

  • cut_event_log_tail: Cuts each case in the event log from the tail.
  • cut_event_log_random: Cuts each case in the event log at random indices.
  • reconstruct_event_log: Reconstructs the event log using the prediction manager.
  • process_mining: Applies a process mining algorithm to the reconstructed event log.
  • conformance_checking_token_based: Performs token-based conformance checking on the reconstructed event log.
  • conformance_checking_alignment_based: Performs alignment-based conformance checking on the reconstructed event log.
  • import_petri_net: Imports a Petri net.
  • export_petri_net: Exports a Petri net.
  • decode_predictions: Decodes the predictions in the event log.

This module allows for the analysis of fitness by cutting the event log, reconstructing it using predictions, and applying process mining and conformance checking algorithms.
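A minimal sketch of the intended workflow, assuming `event_df`, `model`, `config` and the `end_activities` mapping come from the preprocessing and nn_manager modules; column names are illustrative:

pmm = ProcessModelManager(event_df, model, config,
                          "concept:name", "case:concept:name", "time:timestamp")
pmm.end_activities = end_activities   # assumed mapping: encoded activity -> "is end activity" flag

# tail cuts: drop the last 3 events of each case and let the model reconstruct them
pmm.generate_predictive_log("predictive_log.csv", cut_length=3, random_cuts=False)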

ProcessModelManager

Source code in server/process_model_manager.py, lines 26–386
class ProcessModelManager:
    def __init__(self,event_df, model, config, case_activity_key, case_id_key, case_timestamp_key  ):
        #: it is assumed that the model already exists.
        self.model = model  # the generated model
        self.case_activity_key = case_activity_key
        self.case_id_key = case_id_key
        self.case_timestamp_key= case_timestamp_key 
        self.config =config  #config of the nn
        self.event_df =event_df  #the df use for preprocessing and training 

        self.end_activities = {}
        self.predictive_df= None
        # variables to store PM model for further Conformance checking
        self.initial_marking = None
        self.unencoded_df = None #used for conformance checking
        self.final_marking = None
        self.petri_net = None


    def initialize_variables(self):
        """
        initialize variabels for predictive log generator
        """
        case_id_counts = self.event_df[self.case_id_key].value_counts()
        cuts = []
        self.predictive_df = {
            self.case_id_key:[],
            self.case_activity_key:[],
            self.case_timestamp_key:[]
        }
        input_sequences = []
        cuts = {}
        return case_id_counts, cuts, input_sequences, cuts



    def tail_cutter(self, case_id_counts, cut_length, cuts, input_sequences):
        """
        cut sequences cut_length steps from the tail.
        :param cut_length: how many steps to cut from the tail of each sequence. 
        :param case_id_counts: number of steps on each case_id
        :param input_sequences: list of sequences to be cut. 

        Side effect: the predictive_df is extended with the cut sequences.
        """
        for case_id in case_id_counts.index:
            count = case_id_counts.loc[case_id]
            cut = random.randint(1, count)
            cut = count-min(cut_length, cut)
            sequence = self.event_df[self.event_df[self.case_id_key]==case_id]
            sequence = sequence.iloc[:cut]
            if len(sequence) <= self.config.seq_len:
                continue
            cuts[case_id]= (count, cut, count-cut)
            input_sequences.append(sequence)

            self.predictive_df= pd.concat([self.predictive_df, sequence], ignore_index = True)

        return case_id_counts, cuts, input_sequences

    def random_cutter(self, case_id_counts, max_len, cuts, input_sequences):
        """
        Cuts each sequence contained in input_sequences at random indices.

        Args:
            cuts (dict): The cut index and cut length are preserved.
            case_id_counts (pd.Series): Number of rows for each case_id.
            max_len (int): Max length that the input sequence can have. Can be set to improve runtime.
                   TODO: allow INF for max_len.
            input_sequences (list): List of sequences to be cut.
        """
        for i, case_id in enumerate(case_id_counts.index):
            count = case_id_counts.loc[case_id]
            sequence = self.event_df[self.event_df[self.case_id_key]==case_id] 
            if count<=self.config.seq_len or count>max_len:
                continue
            cut = random.randint(self.config.seq_len+1, count)
            sequence = sequence.iloc[:cut]
            cuts[case_id]= (count, cut, count-cut)
            input_sequences.append(sequence)
            self.predictive_df= pd.concat([self.predictive_df, sequence], ignore_index = True)
        return case_id_counts, cuts, input_sequences 

    def fill_up_log(self, upper , non_stop , random_cuts , cut_length , input_sequences, cuts):
        """
        do the predictions for each cut sequence and extend the event log so that 
        it now contains the predictions. 
        """
        #: initialize prediction manager.
        pm=prediction_manager.PredictionManager(
            self.model, 
            self.case_id_key, 
            self.case_activity_key, 
            self.case_timestamp_key, 
            self.config
        )
        pm.end_activities = self.end_activities

        for i, sequence in enumerate(tqdm(input_sequences)): 
            case_id = sequence[self.case_id_key].iloc[1]
            #: do the predictions in the corresponding mode
            pm.multiple_prediction_dataframe(
                cuts[case_id][2],
                1,
                sequence,
                linear = True,  
                non_stop=non_stop,
                upper = upper,
            )
            prediction = pm.paths[0] 
            extension = {
                self.case_id_key:[],
                self.case_activity_key:[],
                self.case_timestamp_key:[]
            }
            #: arrange the predictions in the extension dictionary
            # the first prediction is skipped: it only carries the last
            # timestamp recorded in the input sequence
            for time, (pred, event) in prediction[1:]:
                extension[self.case_id_key].append(case_id)
                extension[self.case_activity_key].append(event)
                extension[self.case_timestamp_key].append(time)

            extension = pd.DataFrame(extension)
            #: here compute cumulative sum of times + last timestamp recorded



            extension[self.case_timestamp_key] = extension[self.case_timestamp_key]*self.config.exponent
            extension[self.case_timestamp_key] = extension[self.case_timestamp_key].cumsum()
            extension[self.case_timestamp_key] = extension[self.case_timestamp_key] + prediction[0][0]

            #: extend the predictive df with the predictions
            self.predictive_df= pd.concat([self.predictive_df, extension], ignore_index = True)

        self.predictive_df=  self.predictive_df.sort_values(by=[self.case_id_key, self.case_timestamp_key])
        self.predictive_df = self.decode_df(self.predictive_df)

    def generate_predictive_log(self, new_log_path, max_len= 15, upper = 30, non_stop = False, random_cuts = False, cut_length = 0): 
        """
        generates a predictive log. each process is cut at some index, and the model is used to
        reconstruct the rest of the process. there are so far three possible modes for cutting and prediction generation:
        - tail cuts: set a cut_length value and set random_cuts to false
        - random cuts with cut memory: set random_cuts to true and non_stop to false
        - random cuts non-stop: set random_cuts to true and non_stop to true

        Args:
            new_log_path: path where the generated predictive log is saved as csv.
            max_len: max length for the cut sequences, i.e. max input sequence length.
            upper: upper bound for the non-stop random cutter, i.e. how many iterations to run before an end activity is reached.
            non_stop: set to true if the predictions should run until the final marking is reached.
            random_cuts: set to true to cut at random indices.
            cut_length: when cutting fixed tail lengths, the tail length to cut for all sequences.
        """

        case_id_counts, cuts, input_sequences = self.initialize_variables()

        self.predictive_df = pd.DataFrame(self.predictive_df)
        if random_cuts: 
            case_id_counts,cuts, input_sequences= self.random_cutter(case_id_counts, max_len,cuts,  input_sequences)
        else: 
            if cut_length ==0: 
                raise exceptions.CutLengthZero()
            case_id_counts,cuts, input_sequences= self.tail_cutter(case_id_counts, cut_length,cuts,  input_sequences)


        self.check_too_short(input_sequences)

        self.fill_up_log( upper , non_stop , random_cuts , cut_length , input_sequences, cuts)

        self.predictive_df.to_csv(new_log_path, sep = ",")

    def check_too_short(self, sequences):
        lengths = [len(seq) for seq in sequences]
        for i in lengths: 
            if i<=self.config.seq_len: 
                print("found too short sequence")
                raise exceptions.CutTooLarge()

    def decode_sequence(self, sequence):
        """
        decodes the input sequence that contains a df.  
        :return: sequence that has been decoded. 
        """
        sequence[self.case_activity_key] = self.config.activity_le.inverse_transform(sequence[self.case_activity_key].astype(int))
        return sequence

    def handle_nat(self, group):
        """
        the inverse transformation for timestamps is lossy and might lead to NaT entries.
        a timedelta of k days with respect to the last valid timestamp is set as the timestamp value for
        the kth NaT entry.
        :param group: a group in the predictive df that contains only one case id. 
        :return: the same group now with valid timestamps
        """
        last_valid_idx = group[self.case_timestamp_key].last_valid_index()
        if last_valid_idx is None:
            return group
        print("found nan")
        last_valid_timestamp= group.loc[last_valid_idx, self.case_timestamp_key]

        nat_indices = group.index[group[self.case_timestamp_key].isna()]
        for i, idx in enumerate(nat_indices):
            group.at[idx, self.case_timestamp_key] = last_valid_timestamp+ pd.Timedelta(days=i + 1)

        return group     

    def decode_df(self, df):
        """
        decodes the predictive df; inverse transform timestamps and event names.
        """

        df[self.case_activity_key] = df[self.case_activity_key].astype("int")
        df[self.case_id_key] = df[self.case_id_key].astype("int")
        df[self.case_activity_key] = self.config.activity_le.inverse_transform(df[self.case_activity_key])
        df[self.case_id_key] = self.config.case_id_le.inverse_transform(df[self.case_id_key])
        df[self.case_activity_key] = df[self.case_activity_key].astype("str")
        df[self.case_id_key] = df[self.case_id_key].astype("str")
        #: note that this operation is lossy and might generate NaT. 

        df[self.case_timestamp_key] = df[self.case_timestamp_key]*(10**self.config.exponent)

        df[self.case_timestamp_key] = df[self.case_timestamp_key].astype("datetime64[ns, UTC]")

        #: handle NaT values
        df= df.groupby(self.case_id_key, group_keys=False).apply(self.handle_nat)
        #: just in case 
        df = df.dropna() 
        #: return the decoded dataframe
        return df

    def import_predictive_df(self, path):
        """
        used for importing a predictive df. 
        """
        self.predictive_df = pd.read_csv(path, sep = ",")

    def visualize(self):
        """
        display the discovered petri net; wrapped in a method so that it can be triggered from outside the class.
        """
        pm4py.view_petri_net(self.petri_net, self.initial_marking, self.final_marking, format='svg')

    def heuristic_miner(self,path,  dependency_threshold=0.5, and_threshold=0.65, loop_two_threshold=0.5, view= False):
        """
        Run heuristic miner on the predictive log and generate a petri net.

        Args:
            path (str): Path used for saving the generated petri net.
            dependency_threshold (float): Dependency threshold parameter for heuristic miner.
            and_threshold (float): AND threshold parameter for heuristic miner.
            loop_two_threshold (float): Loop two threshold parameter for heuristic miner.
        """
        self.format_columns()
        self.petri_net, self.initial_marking, self.final_marking = pm4py.discover_petri_net_heuristics(
            self.predictive_df,
            dependency_threshold, 
            and_threshold, 
            loop_two_threshold, 
            activity_key=self.case_activity_key,
            timestamp_key=self.case_timestamp_key,
            case_id_key= self.case_id_key
        )
        #: export the petri net in the given path
        pm4py.write_pnml(self.petri_net,self.initial_marking, self.final_marking, file_path=path)
        pm4py.save_vis_petri_net(self.petri_net, self.initial_marking, self.final_marking, file_path = path+".png")

    def format_columns(self): 
        """
        exporting to csv changes the datetime types to object, but we need them to be 
        datetime.  
        """
        self.predictive_df[self.case_timestamp_key] = self.predictive_df[self.case_timestamp_key].astype("datetime64[ns, UTC]")
        self.predictive_df[self.case_id_key] = self.predictive_df[self.case_id_key].astype("str")
        #self.predictive_df[self.case_activity_key] = self.config.activity_le.inverse_transform(self.predictive_df[self.case_activity_key].astype(int))
        self.predictive_df[self.case_activity_key] = self.predictive_df[self.case_activity_key].astype("str")

    def inductive_miner(self, path,   noise_threshold=0):
        """
        Run inductive miner on the predictive log and generate a petri net.

        Args:
            path (str): Path used for saving the generated petri net.
            noise_threshold (float): Noise threshold parameter for inductive miner.
        """
        self.format_columns()
        self.petri_net, self.initial_marking, self.final_marking = pm4py.discover_petri_net_inductive(
            self.predictive_df,
            noise_threshold=noise_threshold,
            activity_key=self.case_activity_key,
            timestamp_key=self.case_timestamp_key,
            case_id_key=self.case_id_key
        )
        #pm4py.view_petri_net(self.petri_net, self.initial_marking, self.final_marking, format='svg')
        pm4py.write_pnml(self.petri_net,self.initial_marking, self.final_marking, file_path=path)
        pm4py.save_vis_petri_net(self.petri_net, self.initial_marking, self.final_marking, file_path = path+".png")

    def alpha_miner(self, path):
        """
        Run alpha miner on the predictive log and generate a petri net.

        Args:
            path (str): Path used for saving the generated petri net.
        """
        self.format_columns()
        self.petri_net, self.initial_marking, self.final_marking = pm4py.discover_petri_net_alpha(
            self.predictive_df,
            self.case_activity_key,
            self.case_timestamp_key, 
            self.case_id_key
        )
        #pm4py.view_petri_net(self.petri_net, self.initial_marking, self.final_marking, format='svg')
        pm4py.write_pnml(self.petri_net,self.initial_marking, self.final_marking , file_path=path)
        pm4py.save_vis_petri_net(self.petri_net, self.initial_marking, self.final_marking, file_path = path+".png")

    def prefix_tree_miner(self, path):
        """
        Run prefix tree miner on the predictive log and generate a petri net.

        Args:
            path (str): Path used for saving the generated petri net.
        """
        self.format_columns()
        self.petri_net, self.initial_marking, self.final_marking = pm4py.discover_prefix_tree(
            self.predictive_df,
            self.case_activity_key,
            self.case_timestamp_key,
            self.case_id_key
        )
        #pm4py.view_petri_net(self.petri_net, self.initial_marking, self.final_marking, format='svg')
        pm4py.write_pnml(self.petri_net,self.initial_marking, self.final_marking , file_path=path)
        pm4py.save_vis_petri_net(self.petri_net, self.initial_marking, self.final_marking, file_path = path+".png")


    def conformance_checking_token_based_replay(self):
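        """
        replay the unencoded event log on the discovered petri net and return the token based replay fitness.
        """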
        replayed_traces = pm4py.conformance_diagnostics_token_based_replay(
            self.unencoded_df,  self.petri_net, self.initial_marking, self.final_marking)
        return self.compute_fitness(replayed_traces)

    def compute_fitness(self, replayed_traces):
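        """
        aggregate token based replay diagnostics into a single log fitness value:
        fitness = 0.5*(1 - missing/consumed) + 0.5*(1 - remaining/produced)
        """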
        sum_m =  0
        sum_c = 0
        sum_r = 0
        sum_p  = 0
        # TODO: multiply by trace frequency in the log (there is no such function in pm4py)
        for trace in replayed_traces: 
            sum_m+= 1*trace["missing_tokens"]
            sum_c+= 1*trace["consumed_tokens"]
            sum_r += 1*trace["remaining_tokens"]
            sum_p += 1*trace["produced_tokens"]

        return 0.5*(1-(sum_m/sum_c)) + 0.5*(1-(sum_r/sum_p))

    def conformance_checking_alignments(self):
        aligned_traces = pm4py.conformance_diagnostics_alignments(self.unencoded_df, self.petri_net, self.initial_marking, self.final_marking)
        log_fitness = replay_fitness.evaluate(aligned_traces, variant=replay_fitness.Variants.ALIGNMENT_BASED)
        #: the alignment based evaluator already aggregates fitness over the whole log
        return log_fitness

    def load_petri_net(self, path): 
        self.petri_net, self.initial_marking, self.final_marking = pm4py.read_pnml(path)
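
For reference, the fitness aggregation in compute_fitness can be checked in isolation. The following is a minimal, self-contained toy computation with made-up token counts; the dictionary keys mirror the ones used by pm4py's token based replay diagnostics.

# toy token replay diagnostics in the shape compute_fitness expects
replayed_traces = [
    {"missing_tokens": 0, "consumed_tokens": 5, "remaining_tokens": 0, "produced_tokens": 5},
    {"missing_tokens": 1, "consumed_tokens": 6, "remaining_tokens": 2, "produced_tokens": 7},
]
sum_m = sum(trace["missing_tokens"] for trace in replayed_traces)
sum_c = sum(trace["consumed_tokens"] for trace in replayed_traces)
sum_r = sum(trace["remaining_tokens"] for trace in replayed_traces)
sum_p = sum(trace["produced_tokens"] for trace in replayed_traces)
fitness = 0.5 * (1 - sum_m / sum_c) + 0.5 * (1 - sum_r / sum_p)
print(round(fitness, 3))  # 0.871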

alpha_miner(path)

Run alpha miner on the predictive log and generate a petri net.

Parameters:

Name    Type    Description                                       Default
path    str     Path used for saving the generated petri net.     required
Source code in server/process_model_manager.py
def alpha_miner(self, path):
    """
    Run alpha miner on the predictive log and generate a petri net.

    Args:
        path (str): Path used for saving the generated petri net.
    """
    self.format_columns()
    self.petri_net, self.initial_marking, self.final_marking = pm4py.discover_petri_net_alpha(
        self.predictive_df,
        self.case_activity_key,
        self.case_timestamp_key, 
        self.case_id_key
    )
    #pm4py.view_petri_net(self.petri_net, self.initial_marking, self.final_marking, format='svg')
    pm4py.write_pnml(self.petri_net,self.initial_marking, self.final_marking , file_path=path)
    pm4py.save_vis_petri_net(self.petri_net, self.initial_marking, self.final_marking, file_path = path+".png")

decode_df(df)

decodes the predictive df; inverse transform timestamps and event names.

Source code in server/process_model_manager.py
def decode_df(self, df):
    """
    decodes the predictive df; inverse transform timestamps and event names.
    """

    df[self.case_activity_key] = df[self.case_activity_key].astype("int")
    df[self.case_id_key] = df[self.case_id_key].astype("int")
    df[self.case_activity_key] = self.config.activity_le.inverse_transform(df[self.case_activity_key])
    df[self.case_id_key] = self.config.case_id_le.inverse_transform(df[self.case_id_key])
    df[self.case_activity_key] = df[self.case_activity_key].astype("str")
    df[self.case_id_key] = df[self.case_id_key].astype("str")
    #: note that this operation is lossy and might generate NaT. 

    df[self.case_timestamp_key] = df[self.case_timestamp_key]*(10**self.config.exponent)

    df[self.case_timestamp_key] = df[self.case_timestamp_key].astype("datetime64[ns, UTC]")

    #: handle NaT values
    df= df.groupby(self.case_id_key, group_keys=False).apply(self.handle_nat)
    #: just in case 
    df = df.dropna() 
    #: return the decoded dataframe
    return df
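
The decoding step can be illustrated on its own. A minimal, self-contained sketch, using sklearn's LabelEncoder in place of config.activity_le, a made-up exponent of 9 and made-up column names; pd.to_datetime is used here for the final conversion, where the method above relies on astype("datetime64[ns, UTC]").

import pandas as pd
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(["approve", "register", "review"])  # codes: approve=0, register=1, review=2
df = pd.DataFrame({"activity": [1, 2, 0], "timestamp": [1.7e9, 1.7e9 + 60, 1.7e9 + 120]})

# inverse transform the encoded activities back to their names
df["activity"] = le.inverse_transform(df["activity"].astype(int))

# rescale seconds back to nanoseconds (exponent = 9) and convert to tz-aware datetimes
exponent = 9
df["timestamp"] = pd.to_datetime(df["timestamp"] * (10 ** exponent), unit="ns", utc=True)
print(df)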

decode_sequence(sequence)

decodes the input sequence that contains a df.

Returns: the sequence that has been decoded.

Source code in server/process_model_manager.py
def decode_sequence(self, sequence):
    """
    decodes the input sequence that contains a df.  
    :return: sequence that has been decoded. 
    """
    sequence[self.case_activity_key] = self.config.activity_le.inverse_transform(sequence[self.case_activity_key].astype(int))
    return sequence

fill_up_log(upper, non_stop, random_cuts, cut_length, input_sequences, cuts)

do the predictions for each cut sequence and extend the event log so that it now contains the predictions.

Source code in server/process_model_manager.py
def fill_up_log(self, upper , non_stop , random_cuts , cut_length , input_sequences, cuts):
    """
    do the predictions for each cut sequence and extend the event log so that
    it now contains the predictions.
    """
    #: initialize prediction manager.
    pm=prediction_manager.PredictionManager(
        self.model, 
        self.case_id_key, 
        self.case_activity_key, 
        self.case_timestamp_key, 
        self.config
    )
    pm.end_activities = self.end_activities

    for i, sequence in enumerate(tqdm(input_sequences)): 
        case_id = sequence[self.case_id_key].iloc[1]
        #: do the predictions in the corresponding mode
        pm.multiple_prediction_dataframe(
            cuts[case_id][2],
            1,
            sequence,
            linear = True,  
            non_stop=non_stop,
            upper = upper,
        )
        prediction = pm.paths[0] 
        extension = {
            self.case_id_key:[],
            self.case_activity_key:[],
            self.case_timestamp_key:[]
        }
        #: arrange the predictions in the extension dictionary
        # the first prediction is skipped: it only carries the last
        # timestamp recorded in the input sequence
        for time, (pred, event) in prediction[1:]:
            extension[self.case_id_key].append(case_id)
            extension[self.case_activity_key].append(event)
            extension[self.case_timestamp_key].append(time)

        extension = pd.DataFrame(extension)
        #: here compute cumulative sum of times + last timestamp recorded



        extension[self.case_timestamp_key] = extension[self.case_timestamp_key]*self.config.exponent
        extension[self.case_timestamp_key] = extension[self.case_timestamp_key].cumsum()
        extension[self.case_timestamp_key] = extension[self.case_timestamp_key] + prediction[0][0]

        #: extend the predictive df with the predictions
        self.predictive_df= pd.concat([self.predictive_df, extension], ignore_index = True)

    self.predictive_df=  self.predictive_df.sort_values(by=[self.case_id_key, self.case_timestamp_key])
    self.predictive_df = self.decode_df(self.predictive_df)
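
The timestamp reconstruction at the end of the loop (cumulative sum of the predicted time deltas, shifted by the last recorded timestamp) can be sketched in isolation. In the method above the values stay numeric until decode_df converts them; here, for readability, made-up deltas are turned directly into timedeltas.

import pandas as pd

last_recorded = pd.Timestamp("2024-01-01 10:00:00", tz="UTC")
deltas_ns = pd.Series([300e9, 600e9, 120e9])  # predicted inter-event times: 5 min, 10 min, 2 min in ns

# cumsum turns per-step deltas into offsets from the last recorded event
offsets = pd.to_timedelta(deltas_ns.cumsum(), unit="ns")
print((last_recorded + offsets).tolist())  # 10:05, 10:15 and 10:17 on 2024-01-01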

format_columns()

exporting to csv changes the datetime types to object, but we need them to be datetime.

Source code in server/process_model_manager.py
def format_columns(self): 
    """
    exporting to csv changes the datetime types to object, but we need them to be 
    datetime.  
    """
    self.predictive_df[self.case_timestamp_key] = self.predictive_df[self.case_timestamp_key].astype("datetime64[ns, UTC]")
    self.predictive_df[self.case_id_key] = self.predictive_df[self.case_id_key].astype("str")
    #self.predictive_df[self.case_activity_key] = self.config.activity_le.inverse_transform(self.predictive_df[self.case_activity_key].astype(int))
    self.predictive_df[self.case_activity_key] = self.predictive_df[self.case_activity_key].astype("str")
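
The reason for this method is easy to reproduce: a csv round trip drops the datetime dtype. A self-contained sketch with made-up column names; pd.to_datetime restores the dtype here, where the method above uses astype("datetime64[ns, UTC]") for the same purpose.

import io

import pandas as pd

df = pd.DataFrame({
    "case_id": ["c1", "c1"],
    "activity": ["register", "approve"],
    "timestamp": pd.to_datetime(["2024-01-01 10:00", "2024-01-01 10:30"], utc=True),
})

buffer = io.StringIO()
df.to_csv(buffer, index=False)
buffer.seek(0)
reloaded = pd.read_csv(buffer)

print(reloaded["timestamp"].dtype)  # object: the datetime dtype was lost in the round trip
reloaded["timestamp"] = pd.to_datetime(reloaded["timestamp"], utc=True)
reloaded["case_id"] = reloaded["case_id"].astype("str")
print(reloaded["timestamp"].dtype)  # datetime64[ns, UTC]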

generate_predictive_log(new_log_path, max_len=15, upper=30, non_stop=False, random_cuts=False, cut_length=0)

generates a predictive log. each process is cut at some index, and the model is used to reconstruct the rest of the process. there are so far three possible modes for cutting and prediction generation (see the usage sketch after the source listing below):

- tail cuts: set a cut_length value and set random_cuts to false
- random cuts with cut memory: set random_cuts to true and non_stop to false
- random cuts non-stop: set random_cuts to true and non_stop to true

Parameters:

Name           Description                                                                                                      Default
new_log_path   Path where the generated predictive log is saved as csv.                                                         required
max_len        Max length for the cut sequences, i.e. max input sequence length.                                                15
upper          Upper bound for the non-stop random cutter, i.e. how many iterations to run before an end activity is reached.   30
non_stop       Set to true if the predictions should run until the final marking is reached.                                    False
random_cuts    Set to true to cut at random indices.                                                                            False
cut_length     When cutting fixed tail lengths, the tail length to cut for all sequences.                                       0
Source code in server/process_model_manager.py
def generate_predictive_log(self, new_log_path, max_len= 15, upper = 30, non_stop = False, random_cuts = False, cut_length = 0): 
    """
    generates a predictive log. each process is cut at some index, and the model is used to
    reconstruct the rest of the process. there are so far three possible modes for cutting and prediction generation:
    - tail cuts: set a cut_length value and set random_cuts to false
    - random cuts with cut memory: set random_cuts to true and non_stop to false
    - random cuts non-stop: set random_cuts to true and non_stop to true

    Args:
        new_log_path: path where the generated predictive log is saved as csv.
        max_len: max length for the cut sequences, i.e. max input sequence length.
        upper: upper bound for the non-stop random cutter, i.e. how many iterations to run before an end activity is reached.
        non_stop: set to true if the predictions should run until the final marking is reached.
        random_cuts: set to true to cut at random indices.
        cut_length: when cutting fixed tail lengths, the tail length to cut for all sequences.
    """

    case_id_counts, cuts, input_sequences = self.initialize_variables()

    self.predictive_df = pd.DataFrame(self.predictive_df)
    if random_cuts: 
        case_id_counts,cuts, input_sequences= self.random_cutter(case_id_counts, max_len,cuts,  input_sequences)
    else: 
        if cut_length ==0: 
            raise exceptions.CutLengthZero()
        case_id_counts,cuts, input_sequences= self.tail_cutter(case_id_counts, cut_length,cuts,  input_sequences)


    self.check_too_short(input_sequences)

    self.fill_up_log( upper , non_stop , random_cuts , cut_length , input_sequences, cuts)

    self.predictive_df.to_csv(new_log_path, sep = ",")
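
A usage sketch of the three modes described above; it assumes `manager` is an already constructed instance of the surrounding class, with the model, config, column keys, event_df and end_activities set up elsewhere, and the output paths are purely illustrative.

def build_predictive_logs(manager):
    # tail cuts: cut at most cut_length events from each tail, no random cut indices
    manager.generate_predictive_log("tail_cut_log.csv", cut_length=3, random_cuts=False)

    # random cuts with cut memory: predict exactly as many steps as were cut
    manager.generate_predictive_log("random_cut_log.csv", random_cuts=True, non_stop=False)

    # random cuts non-stop: predict until an end activity is reached, at most `upper` steps
    manager.generate_predictive_log("nonstop_log.csv", random_cuts=True, non_stop=True, upper=30)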

handle_nat(group)

the inverse transformation for timestamps is lossy and might lead to NaT entries. a timedelta of k days with respect to the last valid timestamp is set as the timestamp value for the kth NaT entry.

Parameters: group, a group in the predictive df that contains only one case id.

Returns: the same group, now with valid timestamps.

Source code in server/process_model_manager.py
def handle_nat(self, group):
    """
    the inverse transformation for timestamps is lossy and might lead to NaT entries.
    a timedelta of k days with respect to the last valid timestamp is set as the timestamp value for
    the kth NaT entry.
    :param group: a group in the predictive df that contains only one case id. 
    :return: the same group now with valid timestamps
    """
    last_valid_idx = group[self.case_timestamp_key].last_valid_index()
    if last_valid_idx is None:
        return group
    print("found nan")
    last_valid_timestamp= group.loc[last_valid_idx, self.case_timestamp_key]

    nat_indices = group.index[group[self.case_timestamp_key].isna()]
    for i, idx in enumerate(nat_indices):
        group.at[idx, self.case_timestamp_key] = last_valid_timestamp+ pd.Timedelta(days=i + 1)

    return group     
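
A self-contained illustration of the NaT handling on a toy group (one case id, made-up column names):

import pandas as pd

group = pd.DataFrame({
    "case_id": ["c1"] * 4,
    "timestamp": pd.to_datetime(["2024-01-01 10:00", "2024-01-01 11:00", None, None], utc=True),
})

last_valid_idx = group["timestamp"].last_valid_index()
last_valid_timestamp = group.loc[last_valid_idx, "timestamp"]

# the kth NaT entry becomes the last valid timestamp plus k days
nat_indices = group.index[group["timestamp"].isna()]
for i, idx in enumerate(nat_indices):
    group.at[idx, "timestamp"] = last_valid_timestamp + pd.Timedelta(days=i + 1)

print(group["timestamp"].tolist())  # ends with 2024-01-02 11:00 and 2024-01-03 11:00 (UTC)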

heuristic_miner(path, dependency_threshold=0.5, and_threshold=0.65, loop_two_threshold=0.5, view=False)

Run heuristic miner on the predictive log and generate a petri net.

Parameters:

Name                   Type    Description                                            Default
path                   str     Path used for saving the generated petri net.          required
dependency_threshold   float   Dependency threshold parameter for heuristic miner.    0.5
and_threshold          float   AND threshold parameter for heuristic miner.           0.65
loop_two_threshold     float   Loop two threshold parameter for heuristic miner.      0.5
Source code in server/process_model_manager.py
def heuristic_miner(self,path,  dependency_threshold=0.5, and_threshold=0.65, loop_two_threshold=0.5, view= False):
    """
    Run heuristic miner on the predictive log and generate a petri net.

    Args:
        path (str): Path used for saving the generated petri net.
        dependency_threshold (float): Dependency threshold parameter for heuristic miner.
        and_threshold (float): AND threshold parameter for heuristic miner.
        loop_two_threshold (float): Loop two threshold parameter for heuristic miner.
    """
    self.format_columns()
    self.petri_net, self.initial_marking, self.final_marking = pm4py.discover_petri_net_heuristics(
        self.predictive_df,
        dependency_threshold, 
        and_threshold, 
        loop_two_threshold, 
        activity_key=self.case_activity_key,
        timestamp_key=self.case_timestamp_key,
        case_id_key= self.case_id_key
    )
    #: export the petri net in the given path
    pm4py.write_pnml(self.petri_net,self.initial_marking, self.final_marking, file_path=path)
    pm4py.save_vis_petri_net(self.petri_net, self.initial_marking, self.final_marking, file_path = path+".png")
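
The miner methods are thin wrappers around pm4py's discovery functions. A standalone sketch on a tiny hand-made log (column names and the output path are illustrative); saving the png additionally requires graphviz, so only the pnml export is shown here.

import pandas as pd
import pm4py

log = pd.DataFrame({
    "case": ["c1", "c1", "c1", "c2", "c2", "c2"],
    "activity": ["register", "review", "approve", "register", "review", "reject"],
    "time": pd.to_datetime([
        "2024-01-01 09:00", "2024-01-01 10:00", "2024-01-01 11:00",
        "2024-01-02 09:00", "2024-01-02 09:30", "2024-01-02 10:15",
    ]),
})

net, initial_marking, final_marking = pm4py.discover_petri_net_heuristics(
    log,
    dependency_threshold=0.5,
    activity_key="activity",
    timestamp_key="time",
    case_id_key="case",
)
pm4py.write_pnml(net, initial_marking, final_marking, file_path="heuristic_net.pnml")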

import_predictive_df(path)

used for importing a predictive df.

Source code in server/process_model_manager.py
def import_predictive_df(self, path):
    """
    used for importing a predictive df. 
    """
    self.predictive_df = pd.read_csv(path, sep = ",")

inductive_miner(path, noise_threshold=0)

Run inductive miner on the predictive log and generate a petri net.

Parameters:

Name              Type    Description                                       Default
path              str     Path used for saving the generated petri net.     required
noise_threshold   float   Noise threshold parameter for inductive miner.    0
Source code in server/process_model_manager.py
def inductive_miner(self, path,   noise_threshold=0):
    """
    Run inductive miner on the predictive log and generate a petri net.

    Args:
        path (str): Path used for saving the generated petri net.
        noise_threshold (float): Noise threshold parameter for inductive miner.
    """
    self.format_columns()
    self.petri_net, self.initial_marking, self.final_marking = pm4py.discover_petri_net_inductive(
        self.predictive_df,
        noise_threshold=noise_threshold,
        activity_key=self.case_activity_key,
        timestamp_key=self.case_timestamp_key,
        case_id_key=self.case_id_key
    )
    #pm4py.view_petri_net(self.petri_net, self.initial_marking, self.final_marking, format='svg')
    pm4py.write_pnml(self.petri_net,self.initial_marking, self.final_marking, file_path=path)
    pm4py.save_vis_petri_net(self.petri_net, self.initial_marking, self.final_marking, file_path = path+".png")

initialize_variables()

initialize variables for the predictive log generator

Source code in server/process_model_manager.py
def initialize_variables(self):
    """
    initialize variables for the predictive log generator
    """
    case_id_counts = self.event_df[self.case_id_key].value_counts()
    self.predictive_df = {
        self.case_id_key:[],
        self.case_activity_key:[],
        self.case_timestamp_key:[]
    }
    input_sequences = []
    cuts = {}
    return case_id_counts, cuts, input_sequences

prefix_tree_miner(path)

Run prefix tree miner on the predictive log and generate a petri net.

Parameters:

Name    Type    Description                                       Default
path    str     Path used for saving the generated petri net.     required
Source code in server/process_model_manager.py
def prefix_tree_miner(self, path):
    """
    Run prefix tree miner on the predictive log and generate a petri net.

    Args:
        path (str): Path used for saving the generated petri net.
    """
    self.format_columns()
    self.petri_net, self.initial_marking, self.final_marking = pm4py.discover_prefix_tree(
        self.predictive_df,
        self.case_activity_key,
        self.case_timestamp_key,
        self.case_id_key
    )
    #pm4py.view_petri_net(self.petri_net, self.initial_marking, self.final_marking, format='svg')
    pm4py.write_pnml(self.petri_net,self.initial_marking, self.final_marking , file_path=path)
    pm4py.save_vis_petri_net(self.petri_net, self.initial_marking, self.final_marking, file_path = path+".png")

random_cutter(case_id_counts, max_len, cuts, input_sequences)

Cuts each sequence contained in input_sequences at random indices.

Parameters:

Name              Type        Description                                                                                                    Default
cuts              dict        Dictionary where the original length, cut index and cut length are stored for each case id.                    required
case_id_counts    pd.Series   Number of rows for each case_id.                                                                               required
max_len           int         Max length that the input sequence can have. Can be set to improve runtime. TODO: allow INF for max_len.       required
input_sequences   list        List that collects the cut sequences.                                                                          required
Source code in server/process_model_manager.py
def random_cutter(self, case_id_counts, max_len, cuts, input_sequences):
    """
    Cuts each sequence contained in input_sequences at random indices.

    Args:
        cuts (dict): Dictionary where the original length, cut index and cut length are stored for each case id.
        case_id_counts (pd.Series): Number of rows for each case_id.
        max_len (int): Max length that the input sequence can have. Can be set to improve runtime.
               TODO: allow INF for max_len.
        input_sequences (list): List that collects the cut sequences.
    """
    for i, case_id in enumerate(case_id_counts.index):
        count = case_id_counts.loc[case_id]
        sequence = self.event_df[self.event_df[self.case_id_key]==case_id] 
        if count<=self.config.seq_len or count>max_len:
            continue
        cut = random.randint(self.config.seq_len+1, count)
        sequence = sequence.iloc[:cut]
        cuts[case_id]= (count, cut, count-cut)
        input_sequences.append(sequence)
        self.predictive_df= pd.concat([self.predictive_df, sequence], ignore_index = True)
    return case_id_counts, cuts, input_sequences 
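
The selection and cutting rule can be checked in isolation; a small self-contained sketch with made-up counts, a made-up seq_len of 3 and max_len of 10 (cases shorter than seq_len or longer than max_len are skipped, otherwise the cut index is drawn between seq_len+1 and the case length):

import random

seq_len, max_len = 3, 10
case_id_counts = {"c1": 2, "c2": 6, "c3": 15}  # rows per case

cuts = {}
for case_id, count in case_id_counts.items():
    if count <= seq_len or count > max_len:
        continue  # c1 is too short, c3 is too long
    cut = random.randint(seq_len + 1, count)  # keep at least seq_len + 1 events
    cuts[case_id] = (count, cut, count - cut)  # (original length, cut index, cut length)

print(cuts)  # only c2 qualifies; the exact cut index is random, e.g. {'c2': (6, 5, 1)}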

tail_cutter(case_id_counts, cut_length, cuts, input_sequences)

cut at most cut_length steps from the tail of each sequence (the exact cut point is chosen at random).

Parameters: cut_length, the maximum number of steps to cut from the tail of each sequence; case_id_counts, the number of rows for each case_id; cuts, a dictionary that collects (original length, cut index, cut length) per case id; input_sequences, a list that collects the cut sequences.

Side effect: the predictive_df is extended with the cut sequences.

Source code in server/process_model_manager.py
def tail_cutter(self, case_id_counts, cut_length, cuts, input_sequences):
    """
    cut at most cut_length steps from the tail of each sequence (the exact cut point is chosen at random).
    :param cut_length: maximum number of steps to cut from the tail of each sequence.
    :param case_id_counts: number of rows for each case_id.
    :param cuts: dictionary that collects (original length, cut index, cut length) per case id.
    :param input_sequences: list that collects the cut sequences.

    Side effect: the predictive_df is extended with the cut sequences.
    """
    for case_id in case_id_counts.index:
        count = case_id_counts.loc[case_id]
        cut = random.randint(1, count)
        cut = count-min(cut_length, cut)
        sequence = self.event_df[self.event_df[self.case_id_key]==case_id]
        sequence = sequence.iloc[:cut]
        if len(sequence) <= self.config.seq_len:
            continue
        cuts[case_id]= (count, cut, count-cut)
        input_sequences.append(sequence)

        self.predictive_df= pd.concat([self.predictive_df, sequence], ignore_index = True)

    return case_id_counts, cuts, input_sequences
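
Likewise, the tail cut on a single toy case (8 events, a made-up seq_len of 4 and cut_length of 3):

import random

import pandas as pd

seq_len, cut_length = 4, 3
sequence = pd.DataFrame({"activity": [f"a{i}" for i in range(8)]})  # one toy case with 8 events

count = len(sequence)
cut = random.randint(1, count)
cut = count - min(cut_length, cut)  # drop at most cut_length events from the tail
truncated = sequence.iloc[:cut]

if len(truncated) > seq_len:  # prefixes that end up too short would be skipped
    print(truncated["activity"].tolist())  # e.g. ['a0', ..., 'a5'] when two events were cut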