Backend documentation
Preprocessing module
this module is in charge of:
- supporting event log imports from XES/CSV files.
- formatting the event log so that it can later be used by the nn_manager module. In particular, the timestamps are encoded as integers, the case ids and activity names are encoded, and the rows are sorted by case id and timestamp.
- splitting the event log into training and testing sublogs.
- calculating important values such as the number of activities and the absolute frequency distribution, which are also required for the neural network's training.
- formatting is done automatically after importing, but it can be disabled by setting the corresponding parameter.
- other preprocessing operations, such as replacing NaN values, adding a unique start/end activity to the log, and removing duplicate rows.
Note that this module does not bring the event log into the input format for the RNN; that is done by the module util.py in the subpackage RMTPP_torch.
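The formatting steps described above (label-encoding ids and activities, integer timestamps, sorting by case id and timestamp) can be sketched with plain pandas. This is an illustration of the technique, not the module's actual code, and the XES-style column names are assumptions:

```python
import pandas as pd

def format_event_log(df, case_id="case:concept:name",
                     activity="concept:name", timestamp="time:timestamp"):
    """Label-encode case ids and activities, encode timestamps as
    integers (nanoseconds since epoch), and sort the rows."""
    df = df.copy()
    # label encoding: map each distinct value to an integer code
    df[case_id] = df[case_id].astype("category").cat.codes
    df[activity] = df[activity].astype("category").cat.codes
    # encode timestamps as integers (nanoseconds since the epoch)
    df[timestamp] = pd.to_datetime(df[timestamp]).astype("int64")
    # sort by case id and timestamp, as the nn_manager expects
    return df.sort_values([case_id, timestamp]).reset_index(drop=True)

log = pd.DataFrame({
    "case:concept:name": ["B", "A", "A"],
    "concept:name": ["pay", "order", "ship"],
    "time:timestamp": ["2024-01-02", "2024-01-01", "2024-01-03"],
})
out = format_event_log(log)
```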
Preprocessing
This is the preprocessing unit for our server, which implements all of the above-mentioned functionality.
Source code in server/preprocessing.py, lines 37-364
add_unique_start_end_activity()
if there is no unique start/end activity, add an artificial start and end activity
Source code in server/preprocessing.py, lines 351-364
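The idea can be sketched as follows (toy data; the real method first checks whether a unique start/end activity already exists, and the marker names are assumptions):

```python
def add_unique_start_end(cases, start="<start>", end="<end>"):
    """Prepend/append artificial markers so that every case shares one
    unique start activity and one unique end activity."""
    return {cid: [start] + events + [end] for cid, events in cases.items()}

cases = {"c1": ["a", "b"], "c2": ["b", "c"]}
padded = add_unique_start_end(cases)
```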
encode_df_columns()
- encode the markers and case ids with integers (label encoding)
- encode the timestamps
- returns nothing, but modifies self.event_df
The following holds for self.event_df after this function is called:
- all rows are sorted by case id and timestamp
- the case ids and markers are encoded with integers
- the timestamps are encoded as floats; timezone information is removed.
Source code in server/preprocessing.py, lines 211-270
find_end_activities()
" find the end activities of all cases for an existing log and return a dict with end activities as keys and value is the count of this activity
Source code in server/preprocessing.py, lines 312-321
find_start_activities()
find the start activities of all cases for an existing log and return a dict with start activities as keys and their counts as values
Source code in server/preprocessing.py, lines 305-310
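Both lookups amount to counting the first and the last activity of each case; a minimal stdlib sketch on hypothetical data:

```python
from collections import Counter

# each case is an ordered list of activities (toy data)
cases = {
    "c1": ["register", "check", "pay"],
    "c2": ["register", "pay"],
    "c3": ["call", "check", "pay"],
}

# dicts mapping activity -> number of cases starting/ending with it
start_activities = Counter(events[0] for events in cases.values())
end_activities = Counter(events[-1] for events in cases.values())
```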
get_sample_case()
returns a sample of a case
Source code in server/preprocessing.py, lines 324-329
handle_import(is_xes, path, case_id, timestamp, activity, time_precision=time_precision.TimePrecision.NS, sep=',', formatting=True)
handles the import of the event log.
Parameters:

Name | Type | Description | Default
---|---|---|---
is_xes | bool | If True, the event log is in XES format. If False, it is in CSV format. | required
path | str | Path to the event log. | required
case_id | str | Case id column name. | required
timestamp | str | Timestamp column name. | required
activity | str | Activity column name. | required
time_precision | TimePrecision | Time precision. Defaults to TimePrecision.NS. Note that this functionality is incomplete. | NS
sep | str | Separator. Defaults to ",". | ','
formatting | bool | If True, the event log is formatted so that it can be used by the RNN. Defaults to True. | True
Source code in server/preprocessing.py, lines 69-90
import_event_log(formatting)
helper function for import_event_log_csv and import_event_log_xes.
- generates an EventLog object so that other pm4py functions can use it
- removes all columns other than the three main ones
- removes all NaN entries
- formats the dataframe using pm4py
Effects:
- rows sorted by case id and timestamp
Parameters:

Name | Type | Description | Default
---|---|---|---
formatting | bool | If True, the event log is formatted so that it can be used by the RNN. | required
Source code in server/preprocessing.py, lines 142-197
import_event_log_csv(path, sep, formatting=True)
This is an adapter for format_dataframe such that the event data can be properly used by the RNN.
Parameters:

Name | Type | Description | Default
---|---|---|---
path | str | Path to the event log. | required
sep | str | Separator. | required
formatting | bool | If True, the event log is formatted so that it can be used by the RNN. Defaults to True. | True
Source code in server/preprocessing.py, lines 111-121
import_event_log_dataframe(df, case_id, activity_key, timestamp_key, formatting=True)
This is an adapter for format_dataframe such that the event data can be properly used by the RNN model.
Parameters:

Name | Type | Description | Default
---|---|---|---
df | DataFrame | The event log as a dataframe. | required
case_id | str | Case id column name. | required
activity_key | str | Activity column name. | required
timestamp_key | str | Timestamp column name. | required
formatting | bool | If True, the event log is formatted so that it can be used by the RNN. Defaults to True. | True
Source code in server/preprocessing.py, lines 124-139
import_event_log_xes(path, formatting=True)
Imports an event log in XES format.
Parameters:

Name | Type | Description | Default
---|---|---|---
path | str | Path to the XES file. | required
formatting | bool | If True, the event log is formatted so that it can be used by the RNN. Defaults to True. | True

Effects:
- The event_df dataframe is generated.
- The generated dataframe has 3 columns: case id (string), label (string), and timestamp (datetime64).
- Event log object: its correctness is assumed from the pm4py library and is therefore not tested.
Source code in server/preprocessing.py, lines 93-108
replace_activity_nan_with_mode()
replaces NaN values in the activity column with the mode
Source code in server/preprocessing.py, lines 333-341
split_train_test(train_percentage)
This is a helper function for splitting the event log into training and testing data.
Parameters:

Name | Type | Description | Default
---|---|---|---
train_percentage | float | The percentage of data to be used for training. | required

Returns:

Name | Type | Description
---|---|---
 | tuple | A tuple containing two event logs (dataframes) for training and testing, the number of classes (for the markers), and the absolute frequency distribution for each class in the whole event log.
Source code in server/preprocessing.py, lines 272-301
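A case-level split along these lines can be sketched with pandas. This is illustrative only (the real method's exact split strategy may differ, and the column names are assumptions):

```python
from collections import Counter

import pandas as pd

def split_train_test(df, case_col, act_col, train_percentage):
    """Split by whole cases, and compute the class count and the
    absolute frequency distribution over the whole log."""
    cases = df[case_col].unique()                  # order of appearance
    n_train = int(len(cases) * train_percentage)
    train_cases = set(cases[:n_train])
    train = df[df[case_col].isin(train_cases)]
    test = df[~df[case_col].isin(train_cases)]
    no_classes = df[act_col].nunique()             # number of distinct markers
    freq = Counter(df[act_col])                    # absolute frequencies
    return train, test, no_classes, freq

df = pd.DataFrame({"case": [0, 0, 1, 1, 2, 2],
                   "act":  ["a", "b", "a", "c", "b", "c"]})
train, test, k, freq = split_train_test(df, "case", "act", 2 / 3)
```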
string_to_index(df, column)
translate each marker into a specific integer index.
Source code in server/preprocessing.py, lines 201-208
xes_helper(path)
just a testing function
Source code in server/preprocessing.py, lines 61-66
NN management module
This module is in charge of training the NN Model and also testing it.
The generated model may be exported and also imported later on by this class.
It supports manual training, random search, and grid search.
Config
class containing the configuration for the model.
Attributes:

Name | Type | Description
---|---|---
seq_len | int | The sequence length used for the sliding window.
emb_dim | int | The embedding dimension.
hid_dim | int | The hidden dimension.
mlp_dim | int | The MLP dimension used for the LSTM.
batch_size | int | The batch size.
alpha | float | The alpha value.
dropout | float | The dropout value.
time_precision | TimePrecision | The time precision. Only NS is supported.
lr | float | The learning rate.
epochs | int | The number of epochs.
importance_weight | str | The importance weight (set to a default value as in the RMTPP implementation).
verbose_step | int | The verbose step, just for logging purposes.
cuda | bool | Whether to use the GPU.
absolute_frequency_distribution | Counter | The absolute frequency distribution of the classes.
case_id_le | LabelEncoder | The case ID label encoder.
activity_le | LabelEncoder | The activity label encoder.
exponent | int | The exponent used for the time conversion (see the preprocessing module).
number_classes | int | The number of possible activities in the data.
case_activity_key | str | The case activity key.
case_timestamp_key | str | The case timestamp key.
case_id_key | str | The case ID key.
Source code in server/nn_manager.py, lines 24-139
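The asdict/load_config round-trip can be sketched with a dataclass. This is a simplified stand-in (the real Config also serializes the label encoders and the frequency Counter, and its field set is larger):

```python
from dataclasses import dataclass, asdict, fields

@dataclass
class Config:
    seq_len: int = 10
    emb_dim: int = 32
    hid_dim: int = 64
    mlp_dim: int = 16
    lr: float = 1e-3
    epochs: int = 5
    cuda: bool = False

def load_config(dic):
    """Rebuild a Config from an exported dictionary."""
    # keep only known fields so stale keys in an exported file are ignored
    known = {f.name for f in fields(Config)}
    return Config(**{k: v for k, v in dic.items() if k in known})

cfg = Config(hid_dim=128)
restored = load_config(asdict(cfg))
```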
asdict()
used for exporting as a dictionary
Source code in server/nn_manager.py, lines 73-99
dict_to_encoder(dic)
cast the dictionary to an encoder
Source code in server/nn_manager.py, lines 132-139
encoder_to_dict(encoder)
cast the encoder to a dictionary
Source code in server/nn_manager.py, lines 126-130
load_config(dic)
used for importing
Source code in server/nn_manager.py, lines 100-124
NNManagement
This is the NNManagement class.
Provided functionality:
- Train the model based on the event log.
- Test the model based on the event log.
- Set params.
Source code in server/nn_manager.py, lines 141-376
evaluate()
This is the testing function for the model. It prints out the time_error, precision, recall, and f1 score.
Returns:

Name | Type | Description
---|---|---
time_error | float | The time error.
acc | float | The accuracy.
recall | float | The recall.
f1 | float | The F1 score.
Source code in server/nn_manager.py, lines 159-194
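The reported classification metrics can be computed as follows; this is a stand-in for the real evaluation loop (which also measures the time error), using plain Python and macro averaging as an assumption:

```python
def evaluate_markers(y_true, y_pred):
    """Accuracy, macro recall, and macro F1 over predicted markers."""
    labels = sorted(set(y_true))
    recalls, f1s = [], []
    for c in labels:
        tp = sum(t == c and p == c for t, p in zip(y_true, y_pred))
        fn = sum(t == c and p != c for t, p in zip(y_true, y_pred))
        fp = sum(t != c and p == c for t, p in zip(y_true, y_pred))
        rec = tp / (tp + fn) if tp + fn else 0.0
        prec = tp / (tp + fp) if tp + fp else 0.0
        f1 = 2 * prec * rec / (prec + rec) if prec + rec else 0.0
        recalls.append(rec)
        f1s.append(f1)
    acc = sum(t == p for t, p in zip(y_true, y_pred)) / len(y_true)
    return acc, sum(recalls) / len(recalls), sum(f1s) / len(f1s)

acc, rec, f1 = evaluate_markers([0, 0, 1, 1], [0, 1, 1, 1])
```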
export_nn_model(name='trained_model.pt')
generates the .pt file containing the trained model.
The model state dict contains the optimizer state dict.
Source code in server/nn_manager.py, lines 229-243
get_training_statistics()
Returns:

Name | Type | Description
---|---|---
 | str | The accuracy, recall, and F1 score as a JSON object in string format.
Source code in server/nn_manager.py, lines 196-210
grid_search(search_parameters)
Grid search for the best hyperparameters.
We only do this for the hid_dim, mlp_dim and emb_dim parameters. (decided arbitrarily, can be extended to other parameters as well.)
Parameters:

Name | Type | Description | Default
---|---|---|---
search_parameters | dict | Dictionary containing the search parameters: 'hid_dim': [start, end, step], 'mlp_dim': [start, end, step], 'emb_dim': [start, end, step]. | required

Returns:

Name | Type | Description
---|---|---
 | float | The best accuracy.
Source code in server/nn_manager.py, lines 282-313
import_nn_model(path)
imports a .pt file
Source code in server/nn_manager.py, lines 212-226
load_data(train_data, test_data, case_id, timestamp_key, event_key)
imports training and testing sublogs that were preprocessed by the preprocessing module.
It applies the sliding window algorithm to produce subsequences of the same fixed length. The output is passed to the respective DataLoader object, which computes the time differences, casts the input to tensors, and generates the batches.
Source code in server/nn_manager.py, lines 315-334
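The sliding-window step can be sketched as follows (toy sequence; the real method then hands the windows to the DataLoader):

```python
def sliding_windows(seq, seq_len):
    """Return all contiguous subsequences of length seq_len."""
    return [seq[i:i + seq_len] for i in range(len(seq) - seq_len + 1)]

events = [3, 1, 4, 1, 5, 9]
windows = sliding_windows(events, 4)
```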
random_search(search_parameters, iterations)
Random search for the best hyperparameters. Saves the best model in the class.
We only do this for the hid_dim, mlp_dim and emb_dim parameters. (decided arbitrarily, can be extended to other parameters as well.)
Parameters:

Name | Type | Description | Default
---|---|---|---
search_parameters | dict | Dictionary containing the search parameters: 'hid_dim': [start, end], 'mlp_dim': [start, end], 'emb_dim': [start, end]. | required
iterations | int | Number of iterations. | required

Returns:

Name | Type | Description
---|---|---
 | float | The best accuracy.
Source code in server/nn_manager.py, lines 245-280
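Both search strategies over hid_dim, mlp_dim and emb_dim can be sketched as follows; the objective below is a stand-in for training and evaluating the model, and is an assumption for illustration only:

```python
import itertools
import random

def grid_search(params, objective):
    """Exhaustive search over [start, end, step] ranges."""
    best_acc, best_cfg = float("-inf"), None
    grids = [range(*params[k]) for k in ("hid_dim", "mlp_dim", "emb_dim")]
    for hid, mlp, emb in itertools.product(*grids):
        acc = objective(hid, mlp, emb)
        if acc > best_acc:
            best_acc, best_cfg = acc, (hid, mlp, emb)
    return best_acc, best_cfg

def random_search(params, iterations, objective, seed=0):
    """Sample uniformly from [start, end] ranges (inclusive)."""
    rng = random.Random(seed)
    best_acc, best_cfg = float("-inf"), None
    for _ in range(iterations):
        cfg = tuple(rng.randint(*params[k])
                    for k in ("hid_dim", "mlp_dim", "emb_dim"))
        acc = objective(*cfg)
        if acc > best_acc:
            best_acc, best_cfg = acc, cfg
    return best_acc, best_cfg

# stand-in objective: accuracy peaks at hid=64, mlp=32, emb=16
objective = lambda h, m, e: -abs(h - 64) - abs(m - 32) - abs(e - 16)
acc, cfg = grid_search({"hid_dim": [32, 96, 32],
                        "mlp_dim": [16, 48, 16],
                        "emb_dim": [8, 24, 8]}, objective)
rs_acc, rs_cfg = random_search({"hid_dim": [32, 96],
                                "mlp_dim": [16, 48],
                                "emb_dim": [8, 24]}, 50, objective)
```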
train()
This is the main training function.
Parameters:

Name | Type | Description | Default
---|---|---|---
train_data | DataFrame | The training data. | required
test_data | DataFrame | The test data. | required
case_id | str | The column name of the case ID in the data. | required
timestamp_key | str | The key of the timestamp in the data. | required
no_classes | int | The number of known markers. | required
Source code in server/nn_manager.py, lines 338-376
Prediction management module
This module is in charge of administrating prediction generation.
The following two kinds of predictions can be made:
- single predictions (one step into the future; yields the most likely (event, timestamp) pair)
- multiple predictions (generate a predictive tree; these can be saved in a file)
Predictions are also decoded.
This module is also used by the process_model_manager module, which calls the multiple prediction manager repeatedly. Since that manager supports different options for how the cut sequences should be restored, the parametrized function multiple_prediction_linear is implemented, which grants some runtime benefits.
PredictionManager
Source code in server/prediction_manager.py, lines 33-451
__init__(model, case_id_key, activity_key, timestamp_key, config)
Initializes the PredictionManager object.
Parameters:

Name | Type | Description | Default
---|---|---|---
model | object | The model used for doing predictions. | required
case_id_key | str | The case id key of the log. | required
activity_key | str | The activity key of the log. | required
timestamp_key | str | The timestamp key of the log. | required
config | Config | The configuration used for training and important hyperparameters. | required
Source code in server/prediction_manager.py, lines 34-61
append_one_difference_array(lst)
Appends one difference array to self.recursive_time_diffs.
Parameters:

Name | Type | Description | Default
---|---|---|---
lst | list | List used for calculating the contiguous differences. | required
Source code in server/prediction_manager.py, lines 166-177
append_to_log(time, event)
Appends a window and a difference array to an existing list instead of calling ATMDataset and Dataloader on each iterative call of the prediction generator.
Parameters:

Name | Type | Description | Default
---|---|---|---
time | float | The newly predicted timestamp. | required
event | | The newly predicted event. | required
Source code in server/prediction_manager.py, lines 324-340
backtracking_prediction_tree(c_t, c_e, c_d, depth, degree, current_path)
use backtracking to generate all paths from the given last timestamp and marker, considering the input degree as a branching threshold and the maximum depth for the generated tree.
Source code in server/prediction_manager.py, lines 275-293
check_input_uniqueness()
the input df must contain only one process; hence check if there is one unique case_id
Source code in server/prediction_manager.py, lines 84-88
decode_paths()
used for decoding the events and timestamps in the generated paths. The timestamps are NOT decoded, since the predictions are TIMEDELTAS
Source code in server/prediction_manager.py, lines 350-365
get_differences()
calculates time differences.
Source code in server/prediction_manager.py, lines 154-163
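Computing contiguous time differences for a window can be sketched as (toy timestamps; 0 is assumed for the first event, which has no predecessor):

```python
def contiguous_differences(times):
    """Pairwise differences between consecutive timestamps,
    with 0 for the first event (no predecessor)."""
    return [0] + [b - a for a, b in zip(times, times[1:])]

diffs = contiguous_differences([1.0, 2.5, 4.0, 7.0])
```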
get_dummy_process(df, case_id_column)
just used for testing; create a dummy input df.
Source code in server/prediction_manager.py, lines 64-82
jsonify_paths()
note that we just save the probability of the last pair (time, event) in the path, since the nn calculates lambda*(t), which is the probability of the last predicted event happening in the predicted time t.
Path markers are assumed to be decoded.
Source code in server/prediction_manager.py, lines 367-415
jsonify_single(time_pred, event_pred, prob)
note that we just save the probability of the last pair (time, event) in the path, since the nn calculates lambda*(t) (see paper), which is the probability of the last predicted event happening in the predicted time t.
Source code in server/prediction_manager.py, lines 120-150
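Serializing a single (time, event, probability) prediction can be sketched as follows; the JSON field names here are hypothetical, not the module's actual schema:

```python
import json

def jsonify_single(time_pred, event_pred, prob):
    # only the probability of the last (time, event) pair is stored,
    # matching the lambda*(t) quantity from the RMTPP formulation
    return json.dumps({"time": time_pred, "event": event_pred,
                       "probability": prob})

payload = jsonify_single(3.5, "ship", 0.82)
```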
linear_iterative_predictor(depth, start_time, start_event)
makes predictions linearly (i.e. no backtracking and branching degree = 1) and iteratively (no recursion)
Source code in server/prediction_manager.py, lines 230-243
linear_iterative_predictor_non_stop(start_time, start_event, upper=float('inf'))
Predicts the path of events iteratively until an end activity is found or the upper bound is reached.
Parameters:

Name | Type | Description | Default
---|---|---|---
start_time | float | The start time of the path. | required
start_event | | The start event of the path. | required
upper | float | The upper bound for the number of iterations. Defaults to float("inf"). | float('inf')
Source code in server/prediction_manager.py, lines 206-227
multiple_prediction(depth, degree)
Get a list of possible paths starting at the last timestamp and event pair.
Parameters:

Name | Type | Description | Default
---|---|---|---
depth | int | The number of steps in the future to be predicted. | required
degree | int | The number of predictions on each step to be considered. | required
This method loads data, gets windows, computes paths, and decodes paths. It requires the configuration used for the NN, which is required by the ATM Dataset.
Source code in server/prediction_manager.py, lines 245-272
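The depth/degree expansion with backtracking can be sketched over a stand-in next-step predictor; the transitions table below replaces the RNN and is purely illustrative:

```python
def predict_paths(state, depth, degree, top_k, path=None, out=None):
    """Enumerate all paths of length `depth`, branching over the
    `degree` most likely next events at each step."""
    path = [] if path is None else path
    out = [] if out is None else out
    if depth == 0:
        out.append(list(path))
        return out
    for event, prob in top_k(state, degree):
        path.append((event, prob))
        predict_paths(event, depth - 1, degree, top_k, path, out)
        path.pop()  # backtrack: restore the partial path
    return out

# stand-in predictor: from any state, its successors sorted by likelihood
transitions = {"a": [("b", 0.6), ("c", 0.4)],
               "b": [("c", 0.9), ("a", 0.1)],
               "c": [("a", 0.5), ("b", 0.5)]}
top_k = lambda state, k: transitions[state][:k]
paths = predict_paths("a", depth=2, degree=2, top_k=top_k)
```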
multiple_prediction_dataframe(depth, degree, df, linear=False, non_stop=False, upper=30)
make multiple predictions given a dataframe. The preprocessor is in charge of reading/importing from CSV, XES, command line, etc. It is assumed that the event log contains only one case id.
Source code in server/prediction_manager.py, lines 417-451
multiple_prediction_linear(depth, nonstop, upper)
this is a special case of multiple prediction where the degree = 1. We avoid backtracking and recursion for efficiency reasons.
Source code in server/prediction_manager.py, lines 179-203
pop_from_log()
used for backtracking to restore the old path
Source code in server/prediction_manager.py, lines 342-348
single_prediction()
make one prediction given a partial process.
Source code in server/prediction_manager.py, lines 104-118
single_prediction_dataframe(df)
make one prediction given a dataframe. The preprocessor is in charge of reading/importing from CSV, XES, command line, etc.
Source code in server/prediction_manager.py, lines 90-102
Process model manager module
This module implements all necessary functions for conformance checking and fitness analysis.
Functions:

Name | Description
---|---
cut_event_log_tail | Cuts each case in the event log from the tail.
cut_event_log_random | Cuts each case in the event log at random indices.
reconstruct_event_log | Reconstructs the event log using the prediction manager.
process_mining | Applies a process mining algorithm to the reconstructed event log.
conformance_checking_token_based | Performs token-based conformance checking on the reconstructed event log.
conformance_checking_alignment_based | Performs alignment-based conformance checking on the reconstructed event log.
import_petri_net | Imports a Petri net.
export_petri_net | Exports a Petri net.
decode_predictions | Decodes the predictions in the event log.
This module allows for the analysis of fitness by cutting the event log, reconstructing it using predictions, and applying process mining and conformance checking algorithms.
ProcessModelManager
Source code in server/process_model_manager.py, lines 26-386
alpha_miner(path)
Run alpha miner on the predictive log and generate a Petri net.
Parameters:

Name | Type | Description | Default
---|---|---|---
path | str | Path used for saving the generated Petri net. | required
Source code in server/process_model_manager.py, lines 324-340
decode_df(df)
decodes the predictive df; inverse transform timestamps and event names.
Source code in server/process_model_manager.py, lines 233-258
decode_sequence(sequence)
decodes the input sequence that contains a df.
:return: sequence that has been decoded.
Source code in server/process_model_manager.py, lines 205-211
fill_up_log(upper, non_stop, random_cuts, cut_length, input_sequences, cuts)
do the predictions for each cut sequence and extend the event log so that it now contains the predictions.
Source code in server/process_model_manager.py, lines 109-162
format_columns()
exporting to csv changes the datetime types to object, but we need them to be datetime.
Source code in server/process_model_manager.py, lines 294-302
generate_predictive_log(new_log_path, max_len=15, upper=30, non_stop=False, random_cuts=False, cut_length=0)
generates a predictive log. Each process is cut at some given index, and the model is used to
reconstruct the rest of the process. There are so far three possible modes for cutting and prediction generation:
- for tail cuts: set the cut_length value and set random_cuts to false
- for random cuts with cut memory: set random_cuts to true and non_stop to false
- for random cuts non-stop: set random_cuts to true and non_stop to true
Parameters:

Name | Type | Description | Default
---|---|---|---
max_len | int | Max length for the cut sequences, i.e. the max input sequence length. | required
upper | | Upper bound for the non-stop random cutter, i.e. how many iterations a non-stop iterative predictor should run before reaching an end state. | 30
non_stop | | Must be set to true if the predictions are done until reaching a final marking. | False
random_cuts | | Set to true to cut at random indices. | False
cut_length | | When cutting fixed tail lengths, the tail length to cut for all sequences. | 0
Source code in server/process_model_manager.py, lines 164-196
handle_nat(group)
the inverse transformation for timestamps is lossy and might lead to NaT entries. A timedelta of k seconds with respect to the last valid timestamp is set as the timestamp value for the k-th NaT entry.
:param group: a group in the predictive df that contains only one case id.
:return: the same group, now with valid timestamps.
Source code in server/process_model_manager.py, lines 213-231
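The repair rule (the k-th NaT becomes last valid timestamp + k seconds) can be sketched with pandas, assuming the group's first timestamp is valid:

```python
import pandas as pd

def handle_nat(timestamps):
    """Replace the k-th consecutive NaT after the last valid
    timestamp with last_valid + k seconds."""
    fixed, last_valid, k = [], None, 0
    for ts in timestamps:
        if pd.isna(ts):
            k += 1
            ts = last_valid + pd.Timedelta(seconds=k)
        else:
            last_valid, k = ts, 0
        fixed.append(ts)
    return fixed

ts = [pd.Timestamp("2024-01-01 10:00:00"), pd.NaT, pd.NaT,
      pd.Timestamp("2024-01-01 11:00:00")]
out = handle_nat(ts)
```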
heuristic_miner(path, dependency_threshold=0.5, and_threshold=0.65, loop_two_threshold=0.5, view=False)
Run heuristic miner on the predictive log and generate a Petri net.
Parameters:

Name | Type | Description | Default
---|---|---|---
path | str | Path used for saving the generated Petri net. | required
dependency_threshold | float | Dependency threshold parameter for heuristic miner. | 0.5
and_threshold | float | AND threshold parameter for heuristic miner. | 0.65
loop_two_threshold | float | Loop two threshold parameter for heuristic miner. | 0.5
Source code in server/process_model_manager.py, lines 270-292
import_predictive_df(path)
used for importing a predictive df.
Source code in server/process_model_manager.py, lines 260-264
inductive_miner(path, noise_threshold=0)
Run inductive miner on the predictive log and generate a Petri net.
Parameters:

Name | Type | Description | Default
---|---|---|---
path | str | Path used for saving the generated Petri net. | required
noise_threshold | float | Noise threshold parameter for inductive miner. | 0
Source code in server/process_model_manager.py, lines 304-322
initialize_variables()
initialize variables for the predictive log generator
Source code in server/process_model_manager.py, lines 45-58
prefix_tree_miner(path)
Run prefix tree miner on the predictive log and generate a Petri net.
Parameters:

Name | Type | Description | Default
---|---|---|---
path | str | Path used for saving the generated Petri net. | required
Source code in server/process_model_manager.py, lines 342-358
random_cutter(case_id_counts, max_len, cuts, input_sequences)
Cuts each sequence contained in input_sequences at random indices.
Parameters:

Name | Type | Description | Default
---|---|---|---
cuts | dict | The cut index and cut length are preserved. | required
case_id_counts | Series | Number of rows for each case_id. | required
max_len | int | Max length that the input sequence can have. Can be set to improve runtime. TODO: allow INF for max_len. | required
input_sequences | list | List of sequences to be cut. | required
Source code in server/process_model_manager.py, lines 86-107
tail_cutter(case_id_counts, cut_length, cuts, input_sequences)
cut sequences cut_length steps from the tail.
:param cut_length: how many steps to cut from the tail of each sequence.
:param case_id_counts: number of steps in each case_id.
:param input_sequences: list of sequences to be cut.
Side effect: the predictive_df is extended with the cut sequences.
Source code in server/process_model_manager.py, lines 62-84
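Both cutters can be sketched on toy sequences; this is an illustration only (the real methods also record the cuts and extend the predictive df):

```python
import random

def tail_cut(seq, cut_length):
    """Drop cut_length events from the tail (keep at least one event)."""
    return seq[:max(1, len(seq) - cut_length)]

def random_cut(seq, rng, max_len):
    """Cut at a random index, keeping at most max_len events."""
    cut = rng.randint(1, min(len(seq), max_len))
    return seq[:cut]

seq = ["a", "b", "c", "d", "e"]
head = tail_cut(seq, 2)
rnd = random_cut(seq, random.Random(0), max_len=3)
```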