import functools
import json
import os
import re
import shutil
import sys
import time
import requests
import xml.etree.ElementTree as ET
from ctypes import POINTER, c_char, c_char_p, cast, cdll, create_string_buffer, string_at
from pathlib import Path
from threading import Timer

import psutil

# winreg is part of the standard library on Windows only. On every other
# platform the import fails, and only the two names that are evaluated at
# import time in this module (as default argument values and inside REG_CONF)
# are given placeholders.
try:
    from winreg import (
        ConnectRegistry,
        OpenKey,
        HKEY_LOCAL_MACHINE,
        KEY_ALL_ACCESS,
        QueryValueEx,
        EnumValue,
        DeleteValue,
        SetValueEx,
        REG_DWORD,
        REG_SZ,
    )
except ImportError:
    # Catch only the import failure so unrelated errors (KeyboardInterrupt,
    # SystemExit, ...) are no longer swallowed here.
    REG_DWORD = None
    REG_SZ = None
# Global Constants
# Lazily populated caches: each one keeps its initial value until the matching
# accessor in this module fills it in on first use.
CVREG = None  # set by get_cvreg_linux()
INSTANCE_NAME = None  # set by getInstanceName()
LOG_DIR = None  # set by getLogDir()
CLIENT_NAME = None  # set by getClientName()
REG_KEY_VALUES = {}  # filled by load_all_parent_keys(): registry parent -> {key: value}

# Interval settings. PARENT_ALIVE_CHECK_INTERVAL is handed to threading.Timer
# (i.e. seconds) by checkParentAndKill(); the other three are presumably seconds
# as well -- confirm against their users.
LOG_CHECK_TIMER = 60 * 2
CHECK_SOLR_TIMER = 60 * 5
UPDATE_DB_TIMER = 60 * 5
PARENT_ALIVE_CHECK_INTERVAL = 30
# Rotating-log limits (5 MiB per file).
ROTATING_MAX_BYTES = 5 * 1024 * 1024
ROTATING_BACKUP_COUNT = 1000
STOMP_PORT = 61613
# Names of registry keys holding model / tool locations.
SPACY_MODELS_REG_KEY = "sEESpacyModelsInstallDir"
MODELS_INSTALLATION_PATH_REG_KEY = "sEEModelsInstallDir"
DOCUMENT_CLASSIFY_MODELS_REG_KEY = "sEEDocumentClassifyModelsInstallDir"
CATEGORY_BASED_MODELS_REG_KEY = "sEECategoryBasedModels"
FFMPEG_REG_KEY = "sEEFFmpegPath"
# Registry key name of the content-preview port; queryCS() reads it with a
# default of 22000.
SOLR_TAGGER_PORT = "contentPreviewPort"
# URL template; the single "{}" placeholder takes the port number.
SOLR_TAGGER_URL = (
    "http://localhost:{}/solr/geo_In/cvtagging?overlaps=NO_SUB&rows=0&wt=json&matchText=true"
)
# spaCy model name -> path of the model directory (presumably relative to the
# models install dir -- confirm against the loader).
SPACY_MODELS_MAP = {
    "en_core_web_sm": "en_core_web_sm-2.0.0",
    "en_core_web_lg": "en_core_web_lg-2.0.0/en_core_web_lg/en_core_web_lg-2.0.0",
}
# JSON text (not a parsed object) describing the document classifier models.
DOCUMENT_CLASSIFY_MODELS_MAP = '[{"model_type" : "content_based","model_library" : "spacy","model_path" : "document_classify_v1.1"},{"model_type" : "structure_based","model_library" : "sklearn","model_path" : "structure_classify_v1.1/structure_model.pkl"}]'
PERSON_TECHTERM_CLASSIFIER_MODEL = "cv_person_techterm_classifier/cv_person_techterm_classifier.pkl"
IS_ADDRESS_CLASSIFIER_MODEL = "cv_address_classifier/cv_address_classifier.pkl"
LOADED_DLL = None  # set by getLoadedDLL() once the helper library is loaded
# Shared HTTP session used by queryCS() for its GetEntities requests.
GET_ENTITY_SESSION = requests.Session()

# RER global constants
# Regex fragments meant to be used as a pair around a pattern:
# WORD_BOUNDARY_START leaves one non-capturing group open and
# WORD_BOUNDARY_END supplies the parenthesis that closes it.
WORD_BOUNDARY_START = "(?:(?:[\\s\\,\\>]|\\b)"
WORD_BOUNDARY_END = "(?:[\\s\\,\\<\\.]|\\b))"
NER_GLOBAL = {"is_structure": False}
# Previous mapping, kept for reference:
# LOG_LEVELS = {4: "DEBUG", 3: "INFO", 2: "WARNING", 1: "ERROR", 0: "CRITICAL"}
# as defined in EvDebug.h
# Numeric debug level -> logging level name.
LOG_LEVELS = {
    -1: "ERROR",
    0: "WARNING",
    1: "INFO",  # LVL_DEBUG
    2: "DEBUG",  # LVL_DEBUG_LOW
    3: "VERBOSE",  # LVL_DEBUG_LOW3
}

# Registry-backed configuration table.
# Each entry maps a setting name to a descriptor with:
#   "type"    - Python type the setting is expressed in,
#   "regKey"  - name of the registry key that holds it ("" when there is none),
#   "value"   - presumably the default used when the key is absent (confirm
#               against the code that consumes this table),
#   "keyType" - optional registry value type; only set on some entries, and it
#               is None on platforms where winreg could not be imported.
REG_CONF = {
    "max_threads": {"type": int, "regKey": "sEEMaxThreads", "value": 10},
    "check_entities": {"type": bool, "regKey": "sEECheckExtractedEntities", "value": True},
    "check_failures": {"type": bool, "regKey": "sEECheckFailures", "value": True},
    "extract_timeout": {"type": int, "regKey": "sEETimeout", "value": 60},
    "log_level": {
        "type": int,
        "regKey": "sEELogLevel",
        # numeric LOG_LEVELS key whose name is "INFO"
        "value": list(LOG_LEVELS.keys())[list(LOG_LEVELS.values()).index("INFO")],
    },
    "log_max_bytes": {"type": int, "regKey": "sEELogMaxBytes", "value": 5 * 1024 * 1024},
    "profiling": {"type": bool, "regKey": "sEEProfiling", "value": False},
    "log_backup_count": {"type": int, "regKey": "sEELogBackupCount", "value": 2},
    "stompPort": {"type": int, "regKey": "", "value": 8055},
    "ocr_threshold": {"type": int, "regKey": "sOCRThreshold", "value": 75},
    "use_re2": {"type": bool, "regKey": "sEEUseRe2", "value": True},
    "use_btrer": {"type": bool, "regKey": "sEEUseBtRER", "value": False},
    "deep_validate": {
        "type": bool,
        "regKey": "sEEDeepValidate",
        "value": True,
    },  # enabling deep validations. These changes are already checked in with Form 73620 in SP17
    "pre_process_text": {"type": bool, "regKey": "sEEPreProcessText", "value": True},
    "low_perf": {"type": bool, "regKey": "sEELowPerfMode", "value": True},
    "docs_per_batch": {"type": int, "regKey": "sEENumDocsPerBatch", "value": 100},
    "batches_per_publish": {"type": int, "regKey": "sEENumBatchesPerPublish", "value": 100},
    "proximity_conf": {
        "type": str,
        "regKey": "sEEProximityConf",
        # stored as JSON text; the integer bucket keys become strings once serialized
        "value": json.dumps(
            {
                "base_conf": 50,
                "upper_conf": 90,
                "range": 300,
                "max_matches": 3,
                "boost": 0.25,
                "buckets": {50: 0, 60: 1, 70: 2},
            }
        ),
    },
    "generic_task_kill_timeout": {"type": int, "regKey": "sEEGenericTaskKillTimeout", "value": 120},
    "memory_time_check": {"type": int, "regKey": "sEEMemoryCheckTimeInterval", "value": 30},
    "generic_memory_limit": {
        "type": int,
        "regKey": "sEEGenericMemoryLimit",
        "value": 5,
        "keyType": REG_DWORD,
    },
    "generic_task_gc_timer": {"type": float, "regKey": "sEEGenericTaskGCTimer", "value": 120},
    "generic_perf_stat": {"type": bool, "regKey": "sEEPrintPerfStat", "value": True},
    "generic_perf_stat_print_interval": {
        "type": int,
        "regKey": "sEEPerfStatInterval",
        "value": 300,
    },
    "sp_ner_pipe": {"type": bool, "regKey": "sEESPNERPipe", "value": True},
    "sp_ner_size_limit": {"type": int, "regKey": "sEESPNERSizeLimit", "value": 5 * 1024 * 1024},
    "sp_ner_pipe_min_size": {
        "type": int,
        "regKey": "sEESPNERPipeMinSize",
        "value": 1 * 1024 * 1024,
    },
    "sp_ner_use_nltk_tokenizer": {
        "type": bool,
        "regKey": "sEESPNERUseNLTKTokenizer",
        "value": True,
    },
    "child_docs": {"type": bool, "regKey": "sEEBreakChildDocs", "value": False},
    "cache_clean_interval": {"type": int, "regKey": "sEEEntityCacheCleanInterval", "value": 30},
    "sp_ner_usecustompipeline": {
        "type": bool,
        "regKey": "sEESPNERUseCustomPipeline",
        "value": True,
    },
    "sp_ner_custompipeline": {
        "type": str,
        "regKey": "sEESPNERCustomPipe",
        # comma-separated list of pipeline component names
        "value": "sent,set_entity_label,surname_attr,solr_tagger,cc_cvterms_cvterm_test,address,filtergz",
    },
    "sp_ner_probcutoff": {"type": float, "regKey": "sEESPNERMinScore", "value": 0.0},
    "use_document_classifier": {"type": bool, "regKey": "sEEUseDocumentClassifier", "value": True},
    "enable_remote_debug": {"type": bool, "regKey": "enableRemoteDebug", "value": False},
    "activemq_send_timeout": {
        "type": int,
        "regKey": "sendTimeOut",
        "value": 900000,
        "keyType": REG_DWORD,
    },
    "activemq_persistent_enabled": {
        "type": bool,
        "regKey": "nEnablePersistent",
        "value": False,
        "keyType": REG_DWORD,
    },
    "activemq_producer_prefetch_size": {
        "type": int,
        "regKey": "prefetchSize",
        "value": 1000,
        "keyType": REG_DWORD,
    },
    "activemq_consumer_prefetch_size": {
        "type": int,
        "regKey": "consumerPrefetchSize",
        "value": 1,
        "keyType": REG_DWORD,
    },
    "document_classify_models_reg_key": {
        "type": str,
        "regKey": "sEEDocumentClassifyModelsInstallDir",
        # JSON text; unlike DOCUMENT_CLASSIFY_MODELS_MAP this variant uses
        # backslash path separators and adds a "transformer_path"
        "value": '[{"model_type" : "content_based","model_library" : "spacy","model_path" : "document_classify_v1.1"},{"model_type" : "structure_based","model_library" : "sklearn","model_path" : "structure_classify_v1.1\\structure_model.pkl","transformer_path" : "structure_classify_v1.1\\structure_transformer.pkl"}]',
    },
    "category_based_models_reg_key": {
        "type": str,
        "regKey": "sEECategoryBasedModels",
        # JSON text mapping a document category to its model path
        "value": '{"RESUMES" : {"model_path" : "en_resumes_trained_lg"},"LOGS" : {"model_path" : "en_logs_trained_lg"},"OTHERS" : {"model_path" : "en_core_web_lg-2.0.0\\en_core_web_lg\\en_core_web_lg-2.0.0"}}',
    },
    "ner_exclude_list": {
        "type": str,
        "regKey": "sEESPNERExcludeList",
        "value": '{"PERSON" : ["PERSON_exclusion.list"] }',
    },
    "sp_ner_fetchprobability": {
        "type": int,
        "regKey": "bEEFetchProbability",
        "value": 0,
        "keyType": REG_DWORD,
    },
    "sp_ner_usetrainedmodels": {
        "type": int,
        "regKey": "bEEUseTrainedModels",
        "value": 0,
        "keyType": REG_DWORD,
    },
    "sp_ner_structure_documents_handling": {
        "type": int,
        "regKey": "bNERStructureDocumentHandling",
        "value": 1,
        "keyType": REG_DWORD,
    },
    "ner_processing_timeout": {
        "type": int,
        "regKey": "bNERProcessingTimedOut",
        "value": 600,  # 10 minutes
        "keyType": REG_DWORD,
    },
    "mkl_num_threads": {"type": int, "regKey": "bMKLNumThreads", "value": 1, "keyType": REG_DWORD},
    "phone_entity_leniency": {
        "type": int,
        "regKey": "bPhoneEntityLeniency",
        "value": 0,
        "keyType": REG_DWORD,
    },  # possible values 0 (possible) or 1 (valid)
    "last_library_version": {
        "type": int,
        "regKey": "lastLibraryVersion",
        "value": 0,
        "keyType": REG_DWORD,
    },  # to check if there are new packages to install
    "default_categories": {
        "type": str,
        "regKey": "sDefaultCategories",
        "value": "finance,legal,technical",
    },
    "mlflow_server_port": {"type": int, "regKey": "bMlflowServerPort", "value": 5004},
    "entity_extraction_timeout": {
        "type": int,
        "regKey": "entityExtractionTimeout",
        "value": 5,
        "keyType": REG_DWORD,
    },
    "num_doc_tagger_process": {
        "type": int,
        "regKey": "bNumDocTaggerProcess",
        "value": 1,
        "keyType": REG_DWORD,
    },
    "extract_all_date_formats": {
        "type": int,
        "regKey": "bExtractAllDateFormats",
        "value": 0,
        "keyType": REG_DWORD,
    },
}

# Symbolic name -> numeric status code for a document response
# (0 is success, the remaining entries name failure causes).
DOCUMENT_RESPONSE_CODES = {
    "success": 0,
    "previewFailed": 1,
    "solrRequestFailed": 2,
    "timedOut": 3,
    "queueException": 4,
    "invalidRegex": 5,
}

# Symbolic error name -> numeric error code.
# Layout visible in the table: 10000 is success, 105xx are processing errors,
# 106xx training failures and 107xx classifier prediction failures.
# NOTE(review): "SpacyError" (10001) sits outside the 105xx run, and
# "NERTimedOut" / "SP_NERTimedOut" intentionally or accidentally share 10520.
CA_ERROR_CODES = {
    "success": 10000,
    "moduleLoadError": 10501,
    "loggerNotFound": 10502,
    "BitextPreProcessingError": 10503,
    "FDPreProcessingError": 10504,
    "OCRPreProcessingError": 10505,
    "VPPreProcessingError": 10506,
    "RERPreProcessingError": 10507,
    "SpacyPreProcessingError": 10508,
    "RERError": 10509,
    "RegexError": 10510,
    "OCRError": 10511,
    "BitextError": 10512,
    "SpacyError": 10001,
    "FDError": 10513,
    "VPError": 10514,
    "SpacySizeExceeded": 10515,
    "SpacyFileNotSupported": 10516,
    "NoTextToProcess": 10517,
    "ContentFetchFailed": 10518,
    "MemoryLimitReached": 10519,
    "NERTimedOut": 10520,
    "SP_NERTimedOut": 10520,
    "InsufficientResourcesForNER": 10521,
    "CommserverNotReachable": 10522,
    "RERTimedOut": 10523,
    "DETimedOut": 10524,
    "DOC_TAGGERTimedOut": 10525,
    # training failures 10600-10699
    "CancelTrainingFailed": 10600,
    "ModelTrainingFailed": 10601,
    "NotEnoughData": 10602,
    "LowAccuracyLessData": 10603,
    "LowAccuracyDissimlarData": 10604,
    "TrainCancelByUser": 10605,
    # classifier prediction failures 10700-10799
    "ModelNotTrained": 10700,
    "ModelDoesNotExist": 10701,
    "EntityDoesNotExist": 10702,
    "ClassificationFailed": 10703,
}

# Names of the content-analysis tasks currently in use.
CA_TASK_LIST = [
    "RER",
    "SP_NER",
    "DE",
    "DOC_TAGGER",
    "EMAIL_TAGGER",
]  # these tasks are not getting used anymore, "FACE_DETECTION", "OCR", "VIDEO_PREVIEW"

# Task name -> name of the module associated with that task; the entries for
# the retired tasks are kept as comments.
CA_TASK_MODULE_MAP = {
    "RER": "CvEEClient",
    "SP_NER": "CvEENERClient",
    # "FACE_DETECTION": "CvCAFaceDetect",
    # "OCR": "CvCATextDetect",
    # "VIDEO_PREVIEW": "CvCAVideoPreview",
    "DE": "CvCADEClient",
    "DOC_TAGGER": "cvee_classification_task",
    "EMAIL_TAGGER": "cvee_emailcluster_task",
}

# Task name -> {sub-task name -> module name}; only DOC_TAGGER has sub-tasks.
CA_SUBTASK_MODULE_MAP = {
    "DOC_TAGGER": {"TRAIN": "cvee_train_classifier", "TEST": "cvee_one_class_classifier"}
}

"""
    Config for Video Preview
"""

# Settings for generated video previews.
# "gif" and "mpg" hold per-format options; each option is a dict with the
# effective "value", a human-readable "description" and, for most options,
# either "ValueExamples" or "PossibleValues". "framelen" is a single option
# shared by both formats and "output" switches each preview format on or off.
# NOTE(review): gif.clip is ".gif" although its PossibleValues list only
# ".mp4" and ".png" -- confirm which one is intended.
VIDEO_PREVIEW_CONFIG = {
    "gif": {
        "scale": {
            "value": "-2:100",
            "description": "The scale of the output images; a string with format width:height, with a -2 meaning that one dimension scales with the other to preserve the aspect ratio.",
            "ValueExamples": ["-2:500", "500:-2", "200:500"],
        },
        "framenum": {
            "value": 7,
            "description": "Positive integer value of frames to take for a gif.",
            "ValueExamples": [1, 5, 10],
        },
        "framedur": {
            "value": 1,
            "description": "Postive number value of duration of frame in gif in seconds",
            "ValueExamples": [1, 2, 3],
        },
        "clip": {
            "value": ".gif",
            "description": "String with file ending dictating whether preview should consist of clips ('.mp4') or frames ('.png')",
            "PossibleValues": [".mp4", ".png"],
        },
        "timstart": {
            "value": -1,
            "description": "Integer time in seconds relative to the beginning of a frame to begin capturing a clip",
            "ValueExamples": [-1, 1, 0],
        },
        "timend": {
            "value": 1,
            "description": "Integer time in seconds relative to the end of a frame to stop capturing a clip",
            "ValueExamples": [1, -1, 0],
        },
        "audiomute": {"value": "", "description": "Don't change"},
    },
    "mpg": {
        "scale": {
            "value": "-2:500",
            "description": "The scale of the output video; a string with format width:height, with a -2 meaning one dimension scales with the other to preserve the aspect ratio.",
            "ValueExamples": ["-2:500", "500:-2", "200:500"],
        },
        "ratio2original": {
            "value": 0.1,
            "description": "Positive floating point number representing the ratio of the preview video length to the original video length, with e.g. 0.1 representing 10%",
            "ValueExamples": [0.1, 0.04, 0.9],
        },
        "sample_rate": {
            "value": 8000,
            "description": "Positive integer representing sampling rate of audio, currently not used",
            "ValueExamples": [44100, 24000, 8000],
        },
        "timstart": {
            "value": -1,
            "description": "Integer time in seconds relative to the beginning of a frame to begin capturing a clip",
            "ValueExamples": [-1, 1, 0],
        },
        "timend": {
            "value": 1,
            "description": "Integer time in seconds relative to the end of a frame to stop capturing a clip",
            "ValueExamples": [1, -1, 0],
        },
        "audiomute": {
            "value": "",
            "description": "String representing whether audio should be muted in the preview, with ' -an' representing muted audio and '' representing unmuted audio",
            "PossibleValues": ["", " -an"],
        },
        "clip": {"value": ".mpg", "description": "Don't change"},
    },
    "framelen": {
        "value": 2,
        # key was misspelled "descruotion"; every other option uses "description"
        "description": "Positive integer value representing length in seconds of each section of video being compared",
        "ValueExamples": [1, 2, 3, 4],
    },
    "output": {"gif": True, "vid": True},
}

"""
    Util functions that are used everywhere
"""


def is_linux():
    """Return True when sys.platform contains "linux", otherwise False."""
    return "linux" in sys.platform


def get_helper_dll():
    """Return the file name of the native helper library for this platform.

    Linux gets the shared object, every other platform the Windows DLL.
    """
    return "libCVJNILoader.so" if is_linux() else "CVCIEntityExtractionCsApi.dll"


# Resolved once at import time; getLoadedDLL() joins it onto the base directory.
HELPER_DLL = get_helper_dll()


def getInstanceName():
    """Return the instance name, reading it from disk on first use.

    The raw value is cached in the module-level INSTANCE_NAME. On Linux it is
    the GALAXY_VM="..." entry of ``../galaxy_vm``; elsewhere it is the first
    line of the ``QinetixVM`` file. Both paths are relative to the current
    working directory.

    Returns:
        str: the instance name with surrounding whitespace removed.

    Raises:
        OSError: if the source file cannot be opened.
        IndexError: if ``../galaxy_vm`` has no GALAXY_VM entry.
    """
    global INSTANCE_NAME
    if INSTANCE_NAME is None:
        if is_linux():
            with open("../galaxy_vm") as f:
                INSTANCE_NAME = re.findall('GALAXY_VM="([^";]+)', f.read())[0]
        else:
            with open("QinetixVM") as fp:
                INSTANCE_NAME = fp.readline()
    # The Windows value still carries its line ending, hence the strip.
    return INSTANCE_NAME.strip()


def getLogDir():
    """Return the log directory, looking it up on first use.

    The value is the ``dEVLOGDIR`` entry of the instance's EventManager
    settings: the ``.properties`` file under the CVREG tree on Linux, the
    registry key under HKEY_LOCAL_MACHINE on Windows. A successful lookup is
    cached in the module-level LOG_DIR.

    Returns:
        str: the log directory, or "" when it could not be determined (the
        error is printed and the lookup is retried on the next call).
    """
    global LOG_DIR
    if LOG_DIR is not None:
        return LOG_DIR
    try:
        instanceName = getInstanceName()
        if is_linux():
            log_reg_path = os.path.join(
                get_cvreg_linux(), "Galaxy", instanceName, "EventManager", ".properties"
            )
            with open(log_reg_path) as f:
                LOG_DIR = re.findall(r"dEVLOGDIR\s+([^\s]+)", f.read())[0]
        else:
            reg_path = "SOFTWARE\\CommVault Systems\\Galaxy\\{0}\\EventManager".format(
                instanceName
            )
            # Ask for the value directly: enumerating values by index always
            # ran past the last one and reported a spurious error, even when
            # the directory had been found. The handles are closed on exit.
            with ConnectRegistry(None, HKEY_LOCAL_MACHINE) as aReg:
                with OpenKey(aReg, reg_path) as aKey:
                    LOG_DIR = QueryValueEx(aKey, "dEVLOGDIR")[0]
    except Exception as e:
        # Best effort: callers get "" instead of an exception.
        print(e)
    return LOG_DIR if LOG_DIR is not None else ""


def getPythonSitePackagesDir():
    """Return the first sys.path entry that is an Anaconda site-packages dir.

    An entry qualifies when it ends with "site-packages" and contains
    "Anaconda" (case-sensitive). Returns None when no entry qualifies.
    """
    matches = (
        entry for entry in sys.path if entry.endswith("site-packages") and "Anaconda" in entry
    )
    return next(matches, None)


def get_cvreg_linux():
    """Return the CVREG directory declared in ``../galaxy_vm``.

    The file is read once (path relative to the current working directory)
    and the CVREG="..." entry is cached in the module-level CVREG.

    Raises:
        OSError: if ``../galaxy_vm`` cannot be opened.
        IndexError: if the file has no CVREG entry.
    """
    global CVREG
    if CVREG is None:
        with open("../galaxy_vm") as f:
            CVREG = re.findall('CVREG="([^";]+)', f.read())[0]
    return CVREG


def get_reg_path():
    """Return ``<CVREG>/Galaxy/<instance name>``, the root of the instance's
    file-based registry on Linux."""
    cvreg_root = get_cvreg_linux()
    instance_name = getInstanceName()
    return os.path.join(cvreg_root, "Galaxy", instance_name)


def load_all_parent_keys(registry_parent):
    """
        for each given registry_parent like ContentAnalyzer, Base, EventManager, MessageQueue
        cache all the key value pairs for faster lookup

        The pairs are parsed from ``<reg path>/<registry_parent>/.properties``
        where each entry is a word-character key, spaces/tabs, then a value
        without whitespace. Values are kept as strings. Returns the cached
        dict for registry_parent.

        NOTE(review): the cache entry is created before the file is read, so
        if the read fails the exception propagates once and later calls
        return the empty dict without retrying.
    """
    if registry_parent not in REG_KEY_VALUES:
        parent_values = REG_KEY_VALUES[registry_parent] = {}
        reg_path = os.path.join(get_reg_path(), registry_parent, ".properties")
        with open(reg_path) as f:
            # findall with two groups yields (key, value) tuples; a later
            # duplicate key overrides an earlier one.
            parent_values.update(re.findall(r"(\w+)(?:[ \t]+)([^\s]+)\n?", f.read()))
    return REG_KEY_VALUES[registry_parent]


def get_reg_key_value(registry_parent, key):
    """Return the cached value of `key` under `registry_parent`, or None when
    the key is not present."""
    return load_all_parent_keys(registry_parent).get(key)


def loadSpecificValue(key, item, value, type=int, key_type=REG_SZ):
    """Read one registry value and convert it to `type`.

    Args:
        key: on Linux the registry parent name (e.g. "ContentAnalyzer"); on
            Windows an open registry key handle.
        item: name of the value to read.
        value: default used when the value is missing or cannot be read.
        type: target type (int, float, bool or str). The name shadows the
            builtin but is part of the public signature.
        key_type: accepted for signature compatibility; not used here.

    Returns:
        The stored value (or the default) converted to `type`, or None when
        there is neither a stored value nor a default.

    Raises:
        ValueError/TypeError: if the default or the stored value cannot be
        converted to the requested type.
    """
    # Normalise the default first. A float default keeps its fraction;
    # previously it was truncated through int().
    if type is float:
        value = float(value)
    elif type is not str:
        value = int(value)
    if is_linux():
        try:
            item_value = get_reg_key_value(key, item)
            result = item_value if item_value is not None else value
        except Exception:
            result = value
    else:
        try:
            result = QueryValueEx(key, item)[0]
        except OSError:  # WindowsError is an alias of OSError on Windows
            result = value
    if result is None:
        # Only reachable for type=str with a None default: report "missing"
        # as None rather than the literal string "None".
        return None
    if type is bool:
        # Stored as "0"/"1" text or a DWORD; bool("0") alone would be True.
        result = int(result)
    return type(result)


def loadRegValue(key, value, type=int, path="ContentAnalyzer", key_type=REG_SZ):
    """Return the registry value `key` under `path`, or the default on failure.

    Args:
        key: name of the registry value.
        value: default returned when the lookup fails or the key is missing.
        type: target type handed to loadSpecificValue().
        path: registry parent beneath the instance root.
        key_type: forwarded to loadSpecificValue().

    Returns:
        The converted registry value. When anything goes wrong the error is
        printed and `value` is returned exactly as it was passed in.
    """
    regPath = ""
    try:
        if is_linux():
            value = loadSpecificValue(path, key, value, type, key_type=key_type)
        else:
            # Imported here because the module-level winreg import does not
            # bring KEY_READ into scope.
            from winreg import KEY_READ

            regPath = "SOFTWARE\\Commvault Systems\\Galaxy\\{0}\\".format(getInstanceName())
            regPath += path
            # Read access is all a query needs; asking for KEY_ALL_ACCESS made
            # the lookup fail for processes without write permission. The
            # handles are closed when the block exits.
            with ConnectRegistry(None, HKEY_LOCAL_MACHINE) as aReg:
                with OpenKey(aReg, regPath, 0, KEY_READ) as aKey:
                    value = loadSpecificValue(aKey, key, value, type, key_type=key_type)
    except Exception as e:
        print("There was an error while fetching reg key {}: {}".format(regPath + "\\" + key, e))
    return value


def _upsert_property_text(text, key, value):
    """Return .properties `text` with `key` set to `value`, appending the pair if absent."""
    text = text.strip()
    # Match the key as a whole token followed by spaces/tabs, the same layout
    # load_all_parent_keys() parses. Escaping plus the look-behind stop a key
    # such as "Port" from rewriting "StompPort".
    pattern = re.compile(r"(?<!\w)(" + re.escape(key) + r"[ \t]+)([^\s]+)")
    if pattern.search(text) is None:
        return text + os.linesep + key + " " + str(value)
    # A callable replacement keeps backslashes in the value literal.
    return pattern.sub(lambda match: match.group(1) + str(value), text)


# current usage is really minimal with only usage in CvSpacyBussinessKeywordFilter file
# so we can avoid caching here
def setRegValue(key, value, type=int, path="ContentAnalyzer", key_type=REG_SZ):
    """Write `value` for registry key `key` under `path`.

    On Linux the instance's ``<path>/.properties`` file is rewritten with the
    key updated or appended. On Windows the value is written to the registry.
    Errors are printed, never raised. The REG_KEY_VALUES read cache is not
    refreshed by this call.

    Args:
        key: name of the registry value.
        value: value to store.
        type: accepted for signature compatibility; not used.
        path: registry parent beneath the instance root.
        key_type: accepted for signature compatibility; not used.

    NOTE(review): the Windows branch always writes REG_DWORD regardless of
    key_type, so only integer values can be stored there.
    """
    regPath = ""
    try:
        if is_linux():
            regPath = os.path.join(get_reg_path(), path, ".properties")
            with open(regPath) as f:
                reg_text = _upsert_property_text(f.read(), key, value)
            if len(reg_text) > 0:
                with open(regPath, "w") as f:
                    f.write(reg_text)
        else:
            regPath = "SOFTWARE\\Commvault Systems\\Galaxy\\{0}\\".format(getInstanceName())
            regPath += path
            # Context managers close the registry handles once the write is done.
            with ConnectRegistry(None, HKEY_LOCAL_MACHINE) as aReg:
                with OpenKey(aReg, regPath, 0, KEY_ALL_ACCESS) as aKey:
                    SetValueEx(aKey, key, 0, REG_DWORD, value)
    except Exception as e:
        print("There was an error while writing reg key {}: {}".format(regPath + "\\" + key, e))

def getStompPort():
    """Return the "messagequeueStompPort" value from the MessageQueue registry
    parent as an int, with 8055 as the default."""
    return loadRegValue("messagequeueStompPort", 8055, int, "MessageQueue")


def checkLogLevel():
    """Return the log level stored under the registry key named by
    REG_CONF["log_level"], with 1 as the default."""
    log_level_key = REG_CONF["log_level"]["regKey"]
    return loadRegValue(log_level_key, 1)


def checkGenericLogLevel(module):
    """Return the debug level stored for `module` under EventManager.

    The registry key is ``<module>_DebugLevel``; the default is the numeric
    LOG_LEVELS key whose name is "INFO".
    """
    debug_level_key = module + "_DebugLevel"
    level_codes = list(LOG_LEVELS.keys())
    level_names = list(LOG_LEVELS.values())
    info_level = level_codes[level_names.index("INFO")]
    return loadRegValue(debug_level_key, info_level, int, "EventManager", key_type=REG_DWORD)


def getBaseDir():
    """Return the product base directory.

    The directory is read from the ``dBASEHOME`` value of the "Base" registry
    parent. When that value is missing or empty, the directory containing
    this module is returned instead.
    """
    path = loadRegValue("dBASEHOME", None, str, "Base")
    # The lookup converts through str(), so a missing value can come back as
    # the literal text "None"; treat that and an empty value as "not set" so
    # the fallback actually takes effect.
    if path and path != "None":
        return path
    return os.path.dirname(__file__)


def checkRe2(value):
    """Return the boolean stored under the registry key named by
    REG_CONF["use_re2"], using `value` as the default."""
    re2_key = REG_CONF["use_re2"]["regKey"]
    return loadRegValue(re2_key, value, bool)


def getClientName():
    """Return the client name (``sPhysicalNodeName``) of this instance.

    The value is read from the instance's ``.properties`` file on Linux or
    from the instance's registry key on Windows, and a successful lookup is
    cached in the module-level CLIENT_NAME.

    Returns:
        str: the client name, or "" when it could not be determined.

    Raises:
        OSError: on Windows, when the instance registry key cannot be opened.
    """
    global CLIENT_NAME
    # Serve the cached value before touching the file system or registry.
    if CLIENT_NAME is not None:
        return CLIENT_NAME
    # Use instance name to query registry and get client name
    instanceName = getInstanceName()
    if is_linux():
        try:
            with open(os.path.join(get_cvreg_linux(), "Galaxy", instanceName, ".properties")) as f:
                CLIENT_NAME = re.findall(r"sPhysicalNodeName\s+([^\s]+)", f.read())[0]
        except Exception as e:
            print(e)
    else:
        reg_path = "SOFTWARE\\CommVault Systems\\Galaxy\\{0}".format(instanceName)
        with ConnectRegistry(None, HKEY_LOCAL_MACHINE) as aReg:
            with OpenKey(aReg, reg_path) as aKey:
                try:
                    # Direct lookup replaces enumerating the first 100 values.
                    CLIENT_NAME = QueryValueEx(aKey, "sPhysicalNodeName")[0]
                except OSError:
                    # Value not present: fall through and report "".
                    pass
    return CLIENT_NAME if CLIENT_NAME is not None else ""


def getLoadedDLL():
    """Load the native helper library once and return the ctypes handle.

    The base directory is added to the front of the PATH environment variable
    if it is not already one of its entries, then HELPER_DLL is loaded from
    that directory. The handle is cached in the module-level LOADED_DLL.

    Raises:
        OSError: if the library cannot be loaded.
    """
    global LOADED_DLL

    if LOADED_DLL is None:
        base_abs_path = getBaseDir()
        search_path = os.environ.get("PATH", "")
        # Compare whole entries using the platform separator. A substring test
        # treats "C:\\X" as present whenever "C:\\X\\bin" is, and a fixed ";"
        # is wrong on Linux.
        if base_abs_path not in search_path.split(os.pathsep):
            os.environ["PATH"] = os.pathsep.join(
                entry for entry in (base_abs_path, search_path) if entry
            )
        LOADED_DLL = cdll.LoadLibrary(os.path.join(base_abs_path, HELPER_DLL))

    return LOADED_DLL


"""
    Decorators, TODO: can be moved to different file later
"""


def generic_cache(
    parent_cache_key=None, cache_prune_interval=30 * 60, cache_inactivity_interval=20 * 60
):
    """Decorator factory that memoises a function and prunes idle entries.

    Each decorated function gets its own cache. The cache key is the
    ``cache_token`` keyword argument when the caller passes one, otherwise the
    tuple of positional arguments; other keyword arguments are not part of
    the key. ``cache_token`` is still forwarded to the wrapped function, so
    that function must accept it.

    Args:
        parent_cache_key: accepted for signature compatibility; not used.
        cache_prune_interval: seconds between pruning passes.
        cache_inactivity_interval: an entry untouched for longer than this
            many seconds is dropped by the next pruning pass.

    Raises:
        TypeError: from the wrapper, when the cache key is not hashable.
    """

    def inner_function(func):
        cache_ = dict()
        cache_access_time_ = dict()

        def timed_cache_pruning():
            now = time.time()
            for token, last_access_time in list(cache_access_time_.items()):
                if (now - last_access_time) > cache_inactivity_interval:
                    # pop() tolerates an entry that was already removed.
                    cache_access_time_.pop(token, None)
                    cache_.pop(token, None)
            pruning_timer = Timer(cache_prune_interval, timed_cache_pruning)
            # Daemon thread: housekeeping must not keep the interpreter alive
            # after the main thread has finished.
            pruning_timer.daemon = True
            pruning_timer.start()

        # Starts the recurring pruning cycle when the function is decorated.
        timed_cache_pruning()

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            cache_token = kwargs.get("cache_token", args)
            try:
                cached_output = cache_[cache_token]
            except KeyError:
                # Miss, or pruned between scheduling and lookup.
                pass
            else:
                cache_access_time_[cache_token] = time.time()
                return cached_output
            output = func(*args, **kwargs)
            cache_[cache_token] = output
            # Stamp new entries as well; without a timestamp an entry that is
            # never read again would never be pruned.
            cache_access_time_[cache_token] = time.time()
            return output

        return wrapper

    return inner_function


def retry(num_times=2, delay=5):
    """Decorator factory that re-runs a function when it raises.

    Args:
        num_times: total number of attempts; values below 1 are treated as 1
            so the function is always called at least once.
        delay: seconds to sleep between attempts.

    Every failure is logged through ``kwargs["logger"]`` when the caller
    passed a ``logger`` keyword argument. After the last attempt the original
    exception is re-raised with its type and traceback intact.
    """

    def inner_function(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            attempts_left = max(1, num_times)
            while True:
                try:
                    return func(*args, **kwargs)
                except Exception:
                    if "logger" in kwargs:
                        kwargs["logger"].exception(
                            f"Exception occured in executing {func.__name__}"
                        )
                    attempts_left -= 1
                    if attempts_left <= 0:
                        # Re-raise the real error instead of wrapping it in a
                        # bare Exception.
                        raise
                    time.sleep(delay)

        return wrapper

    return inner_function


"""
    Decorator block End 
"""


@retry(num_times=2, delay=5)
def queryCS(requestType, dcPlanId=None, dcPolicyId=None, retry=True, COMMSERVER_REACHABLE=None, ee_cache_token="default", get_disabled=False):
    """Fetch entity details from the local content-preview REST service.

    The request is only sent when `retry` is truthy or COMMSERVER_REACHABLE is
    given and its ``.value`` is truthy. The plan / policy ids that are not
    None are added to the request as an ``eEDetailReq`` element.

    Returns:
        str: the UTF-8 decoded response body on HTTP 200, otherwise "".
    """
    # Function to query CS DB through the GetEntities endpoint, for getting
    # list of all entities, and returning the response xml
    server_flagged_up = COMMSERVER_REACHABLE is not None and COMMSERVER_REACHABLE.value
    if not (server_flagged_up or retry):
        return ""

    clientName = getClientName()
    instance = getInstanceName()

    id_attributes = []
    if dcPlanId is not None:
        id_attributes.append('dcPlanID="{}"'.format(dcPlanId))
    if dcPolicyId is not None:
        id_attributes.append('dcPolicyID="{}"'.format(dcPolicyId))
    if id_attributes:
        eeDetailsReq = "<eEDetailReq {}/>".format(" ".join(id_attributes))
    else:
        eeDetailsReq = ""

    msg = f'<DM2ContentIndexing_EntityDetailsReq getDisabled="{str(get_disabled).lower()}" requestType="{requestType}"><instance instanceName="{instance}"/><entity ruleType ="1" extractingClientName="{clientName}"/>{eeDetailsReq}</DM2ContentIndexing_EntityDetailsReq>'
    content_preview_port = loadRegValue(SOLR_TAGGER_PORT, 22000)
    cache_token = ee_cache_token.replace(".", "") + "_ca_" + str(requestType)
    params = {"EntityDetailsReq": msg, "eeCacheToken": cache_token}
    url = f"http://localhost:{content_preview_port}/CvContentPreviewGenApp/rest/messagequeue/GetEntities"
    resp = GET_ENTITY_SESSION.get(url, params=params)
    if resp.status_code != requests.codes.OK:
        return ""
    return resp.content.decode("utf-8")


def cleanRawText(text):
    """Strip a leading ``Message-ID: <...>`` header from `text`.

    When the text starts with "Message-ID:", everything up to and including
    the first ">" is removed; if there is no ">" the text is returned
    unchanged. Any other text is returned as is.
    """
    # TODO: escaping of backslashes is expected to be handled by CE, not here.
    if not text.startswith("Message-ID:"):
        return text
    header_end = text.find(">")
    return text[header_end + 1 :]


def cleanRawTextForNER(text):
    """Prepare raw text for NER.

    Removes every non-ASCII character, surrounds each newline and tab with
    single spaces, and finally strips a leading ``Message-ID: <...>`` header
    (everything up to and including the first ">").
    """
    ascii_only = re.sub(r"[^\x00-\x7F]+", "", text)
    padded = ascii_only.replace("\n", " \n ").replace("\t", " \t ")
    if padded.startswith("Message-ID:"):
        return padded[padded.find(">") + 1 :]
    return padded


def loadEntities(
    requiredEntities=None,
    dcPlanId=None,
    dcPolicyId=None,
    logger=None,
    retry=False,
    COMMSERVER_REACHABLE=None,
    get_disabled=False,
    ee_cache_token="default",
):
    """Fetch the entity definitions and index them into lookup tables.

    Args:
        requiredEntities: optional collection of entity ids; when non-empty,
            only those entities are indexed.
        dcPlanId, dcPolicyId, retry, get_disabled, ee_cache_token: forwarded
            unchanged to ``cvee_get_entities.get_entities``.
        logger: optional logger for errors.
        COMMSERVER_REACHABLE: optional object with a ``value`` attribute; it
            is set to 1 once entities were fetched and to 0 on failure.

    Returns:
        dict with the keys:
            "entities_keys": two-way map, entity key -> id and id -> key
            "entities_regex": id -> regex ("" for type-2 entities without one)
            "entities_names": entity name -> id
            "entities_parents": parent id -> list of child ids
            "de_range": id -> proximity range
            "entities_keywords": id -> list of keywords
            "entities_selected": entity key -> bool

    Raises:
        Exception: when the entities cannot be fetched, the fetch returns
        none, or none of them could be indexed.
    """
    # Function to call get_entities(), and parse the response
    # and store the entities in the lookup tables below
    from cvee_get_entities import get_entities

    fetch_failed_msg = (
        "Failed to get the entities from database. Please verify that Commserver is reachable."
    )

    def set_reachable(flag):
        # The reachability flag is optional; update it only when supplied.
        if COMMSERVER_REACHABLE is not None:
            COMMSERVER_REACHABLE.value = flag

    # A set gives constant-time membership checks inside the loop.
    required_ids = set(requiredEntities) if requiredEntities else set()
    limitEntities = bool(required_ids)

    entities_regex = {}
    entities_keys = {}
    entities_names = {}
    entities_parents = {}
    de_range = {}
    entities_keywords = {}
    entities_selected = {}

    try:
        params = {
            "dcPlanId": dcPlanId,
            "dcPolicyId": dcPolicyId,
            "retry": retry,
            "COMMSERVER_REACHABLE": COMMSERVER_REACHABLE,
            "ee_cache_token": ee_cache_token,
            "get_disabled": get_disabled,
        }
        entity_details = get_entities(params)
        if len(entity_details) == 0:
            if logger is not None:
                # No exception is active here, so log a plain error.
                logger.error(f"No entities found in the plan {dcPlanId}")
            raise Exception("No entities found")
    except Exception as err:
        set_reachable(0)
        if logger is not None:
            logger.exception(fetch_failed_msg)
        raise Exception(fetch_failed_msg) from err

    set_reachable(1)
    for entity in entity_details:
        name = entity.entityName
        try:
            entity_id = int(entity.entityId)
            key = entity.entityKey
            if entity_id in entities_keys:
                continue
            if limitEntities and entity_id not in required_ids:
                continue
            entity_type = int(entity.entityType)
            entities_names[name] = entity_id
            regex_json = entity.regularExpression
            if len(regex_json) > 0:
                regex_obj = json.loads(regex_json)
                # The regex JSON may carry the key when the entity has none.
                if key is None and "entity_key" in regex_obj:
                    key = regex_obj["entity_key"]
                if "entity_regex" in regex_obj:
                    entities_regex[entity_id] = regex_obj["entity_regex"]
                elif entity_type == 2:
                    entities_regex[entity_id] = ""

            entities_keys[key] = entity_id
            entities_keys[entity_id] = key
            entities_selected[key] = entity.isSelected != "0"
            parent_entity = entity.parentEntityId
            if parent_entity is not None:
                entities_parents.setdefault(int(parent_entity), []).append(entity_id)
            entityXML = entity.entityXML
            if entityXML is not None:
                keywords = entityXML.keywords
                if keywords is not None and len(keywords) > 0:
                    entities_keywords[entity_id] = keywords.strip(",").split(",")
                if entityXML.proximityRange is not None:
                    de_range[entity_id] = int(entityXML.proximityRange)
        except Exception as e:
            # One malformed entity must not abort the whole load.
            if logger is not None:
                logger.exception(f"Failed to process entity {name}. Exception {e}")

    if not entities_keys:
        set_reachable(0)
        if logger is not None:
            logger.error(fetch_failed_msg)
        raise Exception(fetch_failed_msg)

    return {
        "entities_keys": entities_keys,
        "entities_regex": entities_regex,
        "entities_names": entities_names,
        "entities_parents": entities_parents,
        "de_range": de_range,
        "entities_keywords": entities_keywords,
        "entities_selected": entities_selected,
    }

def checkParentAndKill(parent_pid, child_pid):
    """
        Watchdog that kills child_pid (and its descendants) once parent_pid is gone.

        Checks whether the process parent_pid is still running. While it is,
        the check re-arms itself on a threading.Timer so that it runs again
        after PARENT_ALIVE_CHECK_INTERVAL seconds. When the parent is not
        running, or cannot be inspected at all, killProcessAndChildren(child_pid)
        is called and the watchdog stops.

        parent_pid: pid of the process being watched
        child_pid: pid of the process to kill when the parent is gone

        Errors raised by killProcessAndChildren propagate to the caller
        (or to the timer thread on later ticks).
    """
    try:
        alive = psutil.Process(parent_pid).is_running()
    except Exception:
        # any failure to look the parent up is treated as "parent is gone";
        # unlike a bare except this does not swallow KeyboardInterrupt/SystemExit
        alive = False

    if not alive:
        killProcessAndChildren(child_pid)
        # nothing is left to supervise, so the timer is not re-armed
        return

    Timer(PARENT_ALIVE_CHECK_INTERVAL, checkParentAndKill, (parent_pid, child_pid)).start()


def killProcessAndChildren(parent_pid):
    """
        Kill every descendant of the process parent_pid, then the process itself.

        Descendants are killed best-effort: a descendant that cannot be killed
        is skipped. Errors from looking up parent_pid or from killing it are
        not handled here and propagate to the caller.
    """
    parent = psutil.Process(parent_pid)
    for child in parent.children(recursive=True):
        try:
            child.kill()
        except Exception:
            # best effort; Exception (not a bare except) so that
            # KeyboardInterrupt/SystemExit are not swallowed
            pass
    parent.kill()


def calculate_possible_ner_processes():
    """
        Work out how many NER processes can be spawned on this machine.

        Two estimates are computed and the smaller one wins, capped at 4:
        - memory based: 90% of the available memory, assuming 4 GB per process
        - cpu based: available cores, assuming 6 cores per process
        Both estimates are reduced by one.

        Returns a tuple (num_processes, is_dummy).
        num_processes is always at least 1. is_dummy is True when the
        estimates leave room for no process at all; one process is still
        reported so that messages get picked up (the original note says such
        a process returns an insufficient-memory error for each document).
        If the calculation itself fails, (1, False) is returned.
    """
    MEMORY_REQUIRED_PER_PROCESS = 4.0  # GB
    CORES_REQUIRED_PER_PROCESS = 6
    num_processes = 1
    is_dummy = False
    try:
        # estimate from available memory, keeping 10% of it as headroom
        available_memory = psutil.virtual_memory().available / float(2 ** 30)  # GB
        memory_num_processes = (
            round((available_memory - (available_memory * 0.10)) / MEMORY_REQUIRED_PER_PROCESS) - 1
        )

        # estimate from available cpu cores
        available_cores = get_cpu_cores()
        cores_num_processes = round(available_cores / CORES_REQUIRED_PER_PROCESS) - 1

        # never negative, and never more than 4 processes
        num_processes = max(0, min(memory_num_processes, cores_num_processes))
        num_processes = min(4, num_processes)
        if num_processes == 0:
            is_dummy = True
            num_processes = 1
    except Exception:
        # fall back to a single regular process; Exception (not a bare except)
        # so that KeyboardInterrupt/SystemExit are not swallowed
        num_processes = 1
    return num_processes, is_dummy


def get_available_memory():
    """
        Return the memory psutil reports as available, in GB (2 ** 30 bytes),
        rounded to the nearest integer.
    """
    available_bytes = psutil.virtual_memory().available
    return round(available_bytes / (2 ** 30))


def get_cpu_cores(is_logical=True):
    """
        Return the cpu count reported by psutil.cpu_count.

        is_logical: forwarded as psutil's `logical` flag (True by default)
    """
    core_count = psutil.cpu_count(logical=is_logical)
    return core_count


def to_bytes(str_):
    """
        Coerce str_ to bytes.

        bytes are returned unchanged, str is encoded as utf-8,
        anything else yields None.
    """
    if isinstance(str_, bytes):
        return str_
    if isinstance(str_, str):
        return str_.encode("utf-8")
    return None


def to_string(bytes_):
    """
        Coerce bytes_ to str.

        str is returned unchanged, bytes are decoded as utf-8,
        anything else yields None.
    """
    if isinstance(bytes_, bytes):
        return bytes_.decode("utf-8")
    if isinstance(bytes_, str):
        return bytes_
    return None


def clean(content, *postProcessing):
    """
        Pipe content through the given clean-up callables.

        Each callable in postProcessing is applied in order, receiving the
        result of the previous one; the final result is returned. With no
        callables, content is returned as is.
    """
    return functools.reduce(lambda current, step: step(current), postProcessing, content)


def removeLineBreaks(text, line_sep=r"\n"):
    r"""
        Strip text and collapse every run of line breaks into line_sep.

        Runs of carriage return + line feed are replaced first, then runs of
        plain line feeds. line_sep is used as a regex replacement template, so
        the default r"\n" produces a real line feed.

        Why CRLF is normalised: carriage return does not work well with
        python regex, for example
        re.compile(r"^hello$", flags=re.DOTALL | re.MULTILINE).findall("hi there\r\nhello\r\n")
            --> []
        re.compile(r"^hello$", flags=re.DOTALL | re.MULTILINE).findall("hi there\nhello\n")
            --> ['hello']
        and a single whitespace match with \s behaves differently on the two
        line ending styles:
        >> re.findall(r"\w+\s\w+", "hello\nhi")    # unix line feed
        >> re.findall(r"\w+\s\w+", "hello\r\nhi")  # windows CR + LF
        in the windows case \s only captures the carriage return.
    """
    collapsed = text.strip()
    # order matters: CRLF runs must be handled before bare LF runs
    for line_break_run in (r"(?:\r\n){1,}", r"(?:\n){1,}"):
        collapsed = re.sub(line_break_run, line_sep, collapsed)
    return collapsed


# not in use
def removeTabs(text):
    """
        Return text with every tab character replaced by a single space.
    """
    return " ".join(text.split("\t"))


def removeMultipleSpaces(text):
    """
        Normalise spaces and tabs in text.

        The substitutions run in this order:
        1. two or more consecutive spaces become a single space
        2. each run of tab characters becomes two spaces
        3. each run of "line feed + space" pairs becomes a single line feed
        4. each run of "space + line feed" pairs becomes a single line feed
    """
    substitutions = (
        ("(?: ){2,}", " "),
        ("(?:\t){1,}", "  "),
        (r"(?:\n ){1,}", "\n"),
        (r"(?: \n){1,}", "\n"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text


def removeHiddenText(text):
    """
        Drop a leading Message-ID header from text.

        When text starts with "Message-ID:", everything up to and including
        the first ">" is removed; if there is no ">" the text comes back
        unchanged. Any other text is returned unchanged.
    """
    if not text.startswith("Message-ID:"):
        return text
    id_end = text.find(">")
    return text[id_end + 1 :]


def removeUnicodeCharacters(text):
    """
        Return text encoded as utf-8 bytes.

        NOTE: despite the name, no characters are removed; the result is a
        bytes object holding the utf-8 encoding of text.
    """
    encoded = text.encode("utf-8")
    return encoded


def custom_trim(text):
    """
        Trim decoration characters from the edges of a matched entity text.

        text: a string, or a tuple of strings as produced by a regex with
        capturing groups. For a tuple the first element that is not blank is
        used; if every element is blank (or the tuple is empty) the result is
        an empty string.

        The following are stripped repeatedly until nothing changes:
        whitespace and commas on both ends, ">" on the left, "<" and "." on
        the right.

        Returns the trimmed string.
    """
    if isinstance(text, tuple):
        # we usually avoid capturing groups in regex patterns, but a custom
        # regex may contain them; the first group can be empty, so take the
        # first non-blank one. Fall back to "" so that an all-blank tuple
        # does not reach the string methods below as a tuple.
        text = next((group for group in text if group.strip() != ""), "")
    while True:
        # spaces, then commas, then ">" from the start, "<" and "." from the end
        cleaned_text = text.strip().strip(",").lstrip(">").rstrip("<").rstrip(".")
        if cleaned_text == text:
            return cleaned_text
        text = cleaned_text


def get_local_logger(file_name="CvContentAnalyzer.log", mode="a"):
    """
        Configure logging to a rotating file under the log directory and
        return the logging module itself.

        file_name: name of the log file inside the directory given by getLogDir()
        mode: file open mode handed to the rotating handler

        Rotation uses ROTATING_MAX_BYTES and ROTATING_BACKUP_COUNT; the level
        is INFO. Configuration goes through logging.basicConfig.
    """
    import logging
    from logging.handlers import RotatingFileHandler

    record_format = (
        "%(process)d %(thread)d %(asctime)-15s %(levelname)s: %(module)s::%(funcName)s: %(message)s"
    )
    log_dir = getLogDir()
    rotating_handler = RotatingFileHandler(
        filename=f"{log_dir}/{file_name}",
        mode=mode,
        maxBytes=ROTATING_MAX_BYTES,
        backupCount=ROTATING_BACKUP_COUNT,
    )
    logging.basicConfig(
        level=logging.INFO,
        handlers=[rotating_handler],
        format=record_format,
        datefmt="%d-%b-%y %H:%M:%S",
    )
    return logging


def get_dll_logger(file_name="ContentAnalyzer"):
    """
        Return the logger object built by CvCAGenericLogger.get_logger_handler.

        file_name: log name forwarded to get_logger_handler

        The handler factory receives the path of HELPER_DLL under the base
        directory together with the rotation settings
        (ROTATING_BACKUP_COUNT, ROTATING_MAX_BYTES).
    """
    from CvCAGenericLogger import get_logger_handler

    rotation_settings = {
        "ROTATING_BACKUP_COUNT": ROTATING_BACKUP_COUNT,
        "ROTATING_MAX_BYTES": ROTATING_MAX_BYTES,
    }
    helper_dll_path = os.path.join(getBaseDir(), HELPER_DLL)
    return get_logger_handler(helper_dll_path, file_name, rotation_settings)


def delete_dict_values(dict_, keys=None):
    """
        Remove the given keys from dict_ in place, best effort.

        dict_: dictionary to modify
        keys: collection of keys to delete; None or empty means nothing to do

        Keys that are not present are ignored. Errors raised while deleting
        (for example dict_ not being a dictionary) are suppressed and stop
        the deletion. Returns None.
    """
    if keys is None or len(keys) == 0:
        return
    try:
        for key in keys:
            if key in dict_:
                del dict_[key]
    except Exception:
        # deliberate best effort; Exception (not a bare except) so that
        # KeyboardInterrupt/SystemExit are not swallowed
        pass


def store_model_config(
    model_location,
    test_file=None,
    dependent_files=None,
    predict_method=None,
    load_model_method=None,
):
    """
        Write model.yaml into model_location and copy the model's files next to it.

        model_location, directory receiving model.yaml and the copied files;
            need to make it relative so that models could be transferred easily
        test_file, test file that will be loaded for model prediction; it is
            recorded in the yaml and copied to model_location
        dependent_files, optional iterable of additional files to copy to
            model_location; None means there are none
        predict_method, entry point on how to get the prediction out of the model,
            it should accept a single argument as text
        load_model_method, entry point on how to load the model

        Raises Exception on any failure. Failures are wrapped from the inside
        out, so the outermost message always starts with
        "Failed to create the yaml file at location"; the original error is
        kept in the message and as the chained cause.
    """
    try:
        import yaml

        store_yaml_output = {
            "name": "test_model",
            "entry_points": {
                "file_name": test_file,
                "load_model": {"method": load_model_method},
                "predict": {"method": predict_method},
                "spacy_model_location": get_spacy_model_location(),
            },
        }
        # create the yaml file
        with open(Path(model_location) / Path("model.yaml"), "w") as f:
            yaml.safe_dump(store_yaml_output, stream=f, default_flow_style=False)

        # copy test_file and its dependencies to model_location
        try:
            shutil.copy(test_file, model_location)
            # dependent_files defaults to None, which must mean "no extra files"
            for file_ in dependent_files or ():
                try:
                    shutil.copy(file_, model_location)
                except Exception as copy_error:
                    raise Exception(
                        f"Unable to copy {file_}. Exception {copy_error}"
                    ) from copy_error
        except Exception as e:
            raise Exception(
                f"Failed to copy test file to location [{model_location}]. Exception {e}"
            ) from e

    except Exception as e:
        raise Exception(
            f"Failed to create the yaml file at location [{model_location}]. Exception {e}"
        ) from e


def load_model_config(model_location):
    """
        Read model.yaml from model_location and return its parsed content.

        Raises Exception, with the location and the underlying error in the
        message, when the file cannot be opened or parsed.
    """
    try:
        import yaml

        config_path = Path(model_location) / Path("model.yaml")
        with open(config_path) as config_file:
            return yaml.safe_load(config_file)
    except Exception as e:
        raise Exception(f"Failed to load yaml file from location {model_location}. Exception {e}")


def update_training_status(
    entity_id, model_location, training_status=0, additional_attributes=None
):
    """
        update EntityAnnotator table, xml column with model_location
        and current training status

        entity_id: id of the entity to update; entities whose type is not
            ML_MODEL are left untouched
        model_location: stored as the classifier's modelURI
        training_status: stored as the classifier's trainingStatus
        additional_attributes: optional dict of extra values, could include,
            validationSize,
            classifierAccuracy
            "errLogMessage" and "errorCode" are stored on the classifier's
            err object; any other key is applied only when the classifier
            details already have an attribute of that name

        Raises Exception with the message returned by update_entity when it
        reports a non-zero error code. Errors from get_entity propagate.
    """
    from cvee_get_entities import (
        get_entity,
        update_entity,
        ModelTrainingStatus,
        EEEntityType,
        Error,
    )

    entity_ = get_entity(entity_id)
    if entity_.entityType != str(EEEntityType.ML_MODEL.value):
        return
    classifier_details = entity_.entityXML.classifierDetails
    classifier_details.trainingStatus = str(training_status)
    classifier_details.modelURI = str(model_location)
    if additional_attributes is not None:
        # a new error message starts a fresh err object; do this before the
        # loop so that an errorCode is never discarded, whatever the dict order
        if "errLogMessage" in additional_attributes:
            classifier_details.err = Error()
            classifier_details.err.errLogMessage = additional_attributes["errLogMessage"]
        for attr_key, attr_val in additional_attributes.items():
            if attr_key == "errorCode":
                if classifier_details.err is None:
                    classifier_details.err = Error()
                classifier_details.err.errorCode = attr_val
            elif attr_key in classifier_details.__dict__:
                setattr(classifier_details, attr_key, attr_val)
    # currently all the models are in local storage type, TODO: will change later
    classifier_details.modelStorageType = "2"
    classifier_details.datasetStorageType = "2"
    classifier_details.CAUsedInTraining.lastModelTrainTime = str(int(time.time()))
    error_code, error_message = update_entity(entity_)
    if error_code != 0:
        raise Exception(error_message)


""" 
    Custom exception classes for classifier training.
    They will be moved to a dedicated file once enough custom classes have been added.
"""


class TrainingError(Exception):
    """ Common base class for the classifier training errors """


class NotEnoughData(TrainingError):
    """ Training error for the case where there is too little training data """


class ModelNotTrained(Exception):
    """ Named for a model that has not been trained; raise sites are outside this section """


class ModelDoesNotExist(Exception):
    """ Named for a model that cannot be found; raise sites are outside this section """


class EntityDoesNotExist(Exception):
    """ Named for an entity that cannot be found; raise sites are outside this section """


class SpacyModelLoadException(Exception):
    """ Error for a spacy model that could not be loaded; the message embeds the given reason """

    def __init__(self, reason):
        message = f"Failed to load spacy model, reason: {reason}"
        super().__init__(message)


def get_spacy_model_location(model_name="en_core_web_lg"):
    """
        Return the location of the spacy model model_name.

        The value comes from loadRegValue for SPACY_MODELS_REG_KEY; the
        default handed to it is the model's SPACY_MODELS_MAP entry joined
        onto the python site-packages directory.

        Raises KeyError when model_name is not in SPACY_MODELS_MAP.
    """
    site_packages_dir = getPythonSitePackagesDir()
    default_location = os.path.join(site_packages_dir, Path(SPACY_MODELS_MAP[model_name]))
    return loadRegValue(SPACY_MODELS_REG_KEY, default_location, type=str)


def load_spacy_model(model_name="en_core_web_lg"):
    """
        Load the spacy model model_name and return the loaded pipeline.

        The model location is resolved with get_spacy_model_location. The
        model is only loaded when at least 3 GB of memory is available
        (the current spacy model takes around 2.3 GB of memory); otherwise
        SpacyModelLoadException is raised.
        TODO: can think of loading the smaller models in case of less memory
    """
    spacy_model_location = get_spacy_model_location(model_name)
    if get_available_memory() < 3:
        raise SpacyModelLoadException("Not enough memory to load spacy")

    # imported lazily, only once the memory check has passed
    import spacy

    return spacy.load(spacy_model_location)
