# -*- coding: utf-8 -*-
import datetime
import os
import re
import string
import sys
import threading
import time
from collections import Counter
from contextlib import contextmanager
from itertools import repeat
from multiprocessing import Pool, Queue, cpu_count, current_process
from operator import itemgetter
from threading import Event, Thread, Timer
from functools import lru_cache

import gevent.monkey
import gevent.pool
import requests
from lxml import etree
from flashtext import KeywordProcessor

import phonenumbers

# gevent.monkey.patch_all()
import ujson
import CvEETimeout
from core import SolrConnection
from CvCAGenericLogger import get_logger_handler
from CvCAPerfCounters import PerformanceCounter
from CvCAProximityConf import ProximityBasedConfidence
from CvEEConfigHelper import (
    CA_ERROR_CODES,
    DOCUMENT_RESPONSE_CODES,
    STOMP_PORT,
    checkParentAndKill,
    checkRe2,
    clean,
    custom_trim,
    loadEntities,
    removeHiddenText,
    removeLineBreaks,
    removeMultipleSpaces,
    removeUnicodeCharacters,
)
from CvEEMsgQueueHandler import GenericMsgQueueCommunicator
from CvEEValidate import *
# ``re_library`` is the regex module the rest of this file calls for
# compile/sub/findall.  It defaults to the standard-library ``re`` and is
# switched to ``re2`` when that package can be imported.
re_library = re
try:
    import re2
except Exception:
    # Best-effort fallback: any failure while importing re2 leaves re_library
    # bound to ``re``.  Unlike a bare ``except``, KeyboardInterrupt and
    # SystemExit are no longer swallowed here.
    pass
else:
    re_library = re2

# Identifiers captured once, at the moment this module is imported.
PROCESS_ID = os.getpid()
THREAD_ID = threading.current_thread().ident

# Module-wide settings.  The trailing notes describe the uses that are visible
# in this file; settings without a note are not referenced in the code shown
# here -- NOTE(review): confirm where they are consumed.
MAX_THREADS = 0
CHECK_ENTITIES = False
CHECK_FAILURES = True  # when true, getDocsFromBatch also matches entity_state:[1 TO 5]
HARD_DEBUG = False
PROCESS_IDX = 1
EXTRACT_TIMEOUT = 60  # presumably seconds -- TODO confirm against the code that applies it
ACTIVEMQ_WAIT_TIMEOUT = 0  # timeout given to Event.wait() between queue polls in EEClient.run
NO_DOCS_TO_PROCESS = 100  # page size ("rows") of the Solr batch query in getDocsFromBatch
LOW_PERF = False  # when true, EE.__init__ does not create its gevent pool
PRE_PROCESS_TEXT = True  # when true, fetched document text is passed through clean(...)
USER_DEFINED_PROXIMITY_CONF = {}  # handed to ProximityBasedConfidence in EE.initProxConfClass
MIN_CONF = 50  # entity confidence must be strictly greater than this to be kept

# Shared HTTP sessions; EE.__init__ uses them as the initial document-fetch
# and result-push sessions of every EE instance.
DOCUMENT_CONTENT_SESSION = requests.Session()
DOCUMENT_UPDATE_SESSION = requests.Session()

# ``ident`` of the multiprocessing process object this module was imported in.
pid = current_process().ident

@lru_cache()
def get_date_keyword_processor():
    """Return the (cached) flashtext processor that locates month names.

    The processor is case-insensitive and treats ASCII letters and ``_`` as
    non-word-boundary characters.  It is built on first use and then reused
    through ``lru_cache``.

    The keywords used to be produced by splitting a pasted regular expression
    on ``|``, which also registered regex fragments (for example ``nd``,
    ``rd`` and ``Dec)(?: {1,3}\\d{1,2}(?:st``) as keywords.  Only the month
    names themselves are registered now.

    Returns:
        KeywordProcessor: processor with full and abbreviated English month
        names added as keywords.
    """
    month_names = (
        "January",
        "February",
        "March",
        "April",
        "May",
        "June",
        "July",
        "August",
        "September",
        "October",
        "November",
        "December",
        "Jan",
        "Feb",
        "Mar",
        "Apr",
        "Jun",
        "Jul",
        "Aug",
        "Sep",
        "Sept",
        "Oct",
        "Nov",
        "Dec",
    )
    keyword_processor = KeywordProcessor(case_sensitive=False)
    keyword_processor.non_word_boundaries = set(string.ascii_letters + "_")
    for month_name in month_names:
        keyword_processor.add_keyword(month_name)
    return keyword_processor

@contextmanager
def TimeIt(label, func_str):
    """Context manager that logs how long the wrapped block took.

    The elapsed time is logged through ``LOGGER_Client.info`` as
    ``"<label> : <elapsed>"`` even when the block raises.

    ``time.clock()`` was removed in Python 3.8, so the measurement uses
    ``time.perf_counter()`` instead.

    Args:
        label: text placed in front of the elapsed time in the log line.
        func_str: caller prefix passed as the second argument to the logger.
    """
    start = time.perf_counter()
    try:
        yield
    finally:
        elapsed = time.perf_counter() - start
        LOGGER_Client.info("{} : {}".format(label, elapsed), func_str)


def getConfiguration():
    """Return the client configuration dict.

    The single ``"q_params"`` entry is the tuple
    ``("127.0.0.1", STOMP_PORT, "EEQueue", "EERQueue")``.
    """
    queue_params = ("127.0.0.1", STOMP_PORT, "EEQueue", "EERQueue")
    return {"q_params": queue_params}


class EEClient(Thread, GenericMsgQueueCommunicator):
    """Thread that polls the ``EEQueue`` message queue for extraction frames.

    For each frame taken off ``EEQueue`` an initial status message
    (``"flag": "0"``) is published on ``EERQueue`` and the frame is then passed
    to the handler callable given at construction time.
    """

    def __init__(self, config, eeHanadler):
        """Store the configuration and prepare the queue bookkeeping.

        Args:
            config: dict.  ``config["perfCounter"]`` is required.  When
                ``"q_params"`` is present, its first two items are used as
                host and STOMP port; otherwise ``127.0.0.1`` and ``61650``.
            eeHanadler: callable invoked with every frame read from EEQueue.
        """
        Thread.__init__(self)
        GenericMsgQueueCommunicator.__init__(self)
        self._config = config
        self.host = config["q_params"][0] if "q_params" in config else "127.0.0.1"
        self.stompPort = config["q_params"][1] if "q_params" in config else 61650
        self.EEQueue = "EEQueue"
        self.REEQueue = "EERQueue"
        self.clients = {
            self.EEQueue: {"subscribe": True, "client": None},
            self.REEQueue: {"subscribe": False, "client": None},
        }
        self._stopped = Event()
        self._eeHanadler = eeHanadler
        self.perfCounter = config["perfCounter"]

    def stop(self):
        """Signal the polling loop in ``run`` to finish."""
        module_name = "CvEEClient.EEClient"
        func_name = "stop"
        func_str = "{}::{}() - ".format(module_name, func_name)
        LOGGER_Client.info("Stopping the Entity Extraction Client", func_str)
        self._stopped.set()

    def run(self):
        """Poll EEQueue until ``stop`` is called.

        Every received frame is acknowledged through ``updateInitialStatus``
        and then handed to the handler.  Any exception raised while handling a
        frame marks the client as disconnected, is logged with its traceback
        and triggers a reconnect.
        """
        module_name = "CvEEClient.EEClient"
        func_name = "run"
        func_str = "{}::{}() - ".format(module_name, func_name)
        self.connectToQueue()
        LOGGER_Client.info("Connected to tcp://{}:{}".format(self.host, self.stompPort), func_str)

        while not self._stopped.wait(ACTIVEMQ_WAIT_TIMEOUT):
            try:
                self.perfCounter.start_stopwatch("MsgQueueDequeue")
                frame = self.getFrame(self.clients[self.EEQueue]["client"])
                if frame is None:
                    # NOTE(review): the "MsgQueueDequeue" stopwatch is not
                    # stopped on this path and is started again on the next
                    # iteration -- confirm PerformanceCounter tolerates that.
                    continue
                self.perfCounter.stop_stopwatch("MsgQueueDequeue")
                self.updateInitialStatus(frame)
                self._eeHanadler(frame)
            except Exception:
                self.connected = False
                # Logged with the traceback: the failure is not necessarily a
                # broker disconnect (it may come from the handler or from the
                # status update), and the original cause was previously lost.
                LOGGER_Client.exception(
                    "Disconnected from ActiveMQ. Will reconnect in {} minutes.".format(
                        self.AMQ_CONNECT_WAIT_TIME / 60
                    ),
                    func_str,
                )
                self.connectToQueue()
        LOGGER_Client.debug("Finished polling for analysis tasks. Exiting...", func_str)

    def getContentIds(self, response):
        """Return the ``contentid`` of every document in ``response.results``."""
        return [doc["contentid"] for doc in response.results]

    def getListOfIds(self, message):
        """Look up the content ids of the batch described by ``message``.

        Args:
            message: dict with the keys ``solrurl``, ``sc_batchno``, ``start``,
                ``offset`` and ``endTime``.

        Returns:
            The ids joined with ``","``, or ``False`` when the Solr lookup
            fails (the failure is logged).
        """
        module_name = "CvEEClient"
        func_name = "getListOfIds"
        func_str = "{}::{}() - ".format(module_name, func_name)
        solrurl = message["solrurl"] + "/solr"
        scID = message["sc_batchno"].split(":")[0]
        try:
            conn = SolrConnection(solrurl)
            try:
                response = self.getDocsFromBatch(
                    conn, scID, message["start"], message["offset"], 1, message["endTime"]
                )
            finally:
                # Close the connection even when the query raises; previously
                # it was left open on that path.
                conn.close()
            return ",".join(self.getContentIds(response))
        except Exception as e:
            LOGGER_Client.exception(
                "Couldn't get list of ids because: {0}. Solrurl is {1}".format(e, solrurl), func_str
            )
            return False

    def getDocsFromBatch(self, conn, scID, start, offset, diff, endTime):
        """Query Solr for the ``contentid`` of the documents of one batch.

        When ``CHECK_FAILURES`` is true the query additionally matches
        documents with ``entity_state`` in ``[1 TO 5]``.

        Args:
            conn: connection object exposing ``query(...)``.
            scID: value matched against ``apid``.
            start: lower bound of the ``extractingat`` range (``Z`` is appended).
            offset: row offset of the batch.
            diff: number of ``NO_DOCS_TO_PROCESS``-sized pages to fetch; the
                start row is moved back by ``diff - 1`` pages.
            endTime: upper bound of the ``extractingat`` range (``Z`` is appended).

        Returns:
            Whatever ``conn.query`` returns, sorted ascending by ``extractingat``.
        """
        if CHECK_FAILURES:
            raw_query = "((cistate:1 OR cistate:16) AND (datatype:1 OR datatype:2)) AND apid:{} AND ((extractingat:[{}Z TO {}Z]) OR (entity_state:[1 TO 5]) OR (extractingat:[* TO {}Z] AND (NOT entity_state:*)))"
        else:
            raw_query = "((cistate:1 OR cistate:16) AND (datatype:1 OR datatype:2)) AND apid:{} AND ((extractingat:[{}Z TO {}Z]) OR (extractingat:[* TO {}Z] AND (NOT entity_state:*)))"
        query = raw_query.format(scID, start, endTime, start)
        return conn.query(
            query,
            fields=["contentid"],
            start=offset - ((diff - 1) * NO_DOCS_TO_PROCESS),
            rows=NO_DOCS_TO_PROCESS * diff,
            sort="extractingat",
            sort_order="asc",
        )

    def updateInitialStatus(self, payload):
        """Publish the "picked up" status (``"flag": "0"``) for a frame.

        Args:
            payload: JSON text of the frame.  The decoded object must contain
                ``batchno``, ``sc_batchno``, ``start``, ``end``, ``endTime``,
                ``solrurl`` and ``offset``.

        The message carries the id list from ``getListOfIds`` (``False`` when
        that lookup failed) and the current UTC time as ``pickedUpTime``.
        """
        payload = ujson.loads(payload)
        ids = self.getListOfIds(payload)
        message = {
            "batchno": payload["batchno"],
            "sc_batchno": payload["sc_batchno"],
            "start": payload["start"],
            "end": payload["end"],
            "endTime": payload["endTime"],
            "solrurl": payload["solrurl"],
            "offset": payload["offset"],
            "ids": ids,
            "flag": "0",
            "pickedUpTime": datetime.datetime.utcnow().isoformat(),
        }

        jsonMsg = ujson.dumps(message, ensure_ascii=False)

        self.perfCounter.start_stopwatch("ClientMsgQueueEnqueueInitial")
        self.sendOnConnect(self.REEQueue, jsonMsg.encode("utf-8"))
        self.perfCounter.stop_stopwatch("ClientMsgQueueEnqueueInitial")

    def updateStatus(self, payload, stop_status):
        """Publish the final status (``"flag": "1"``) for a batch.

        Args:
            payload: dict with ``batchno``, ``sc_batchno``, ``start``, ``end``,
                ``solrurl`` and ``offset``.
            stop_status: when truthy, the queue clients are disconnected and
                ``sys.exit()`` is called after the message has been sent.
        """
        message = {
            "batchno": payload["batchno"],
            "sc_batchno": payload["sc_batchno"],
            "start": payload["start"],
            "end": payload["end"],
            "solrurl": payload["solrurl"],
            "offset": payload["offset"],
            "flag": "1",
        }

        jsonMsg = ujson.dumps(message, ensure_ascii=False)

        self.perfCounter.start_stopwatch("ClientMsgQueueEnqueueFinal")
        self.sendOnConnect(self.REEQueue, jsonMsg.encode("utf-8"))
        self.perfCounter.stop_stopwatch("ClientMsgQueueEnqueueFinal")

        if stop_status:
            self.disconnectClients()
            sys.exit()


class EE:
    def __init__(self, config):
        """Initialise the per-instance extraction state.

        Args:
            config: configuration object, stored unchanged as ``self.config``.
        """
        self.config = config

        # Worker pool (skipped when LOW_PERF is set) and the result queue.
        if not LOW_PERF:
            self.pool = gevent.pool.Pool(5)
        self.queue = Queue()

        # Per-document bookkeeping.
        self.docToCommit = False
        self.state = DOCUMENT_RESPONSE_CODES["success"]
        self.framePayLoad = {}
        self.results = {}

        # Pattern id -> validator callable.  According to the original notes,
        # every deep validation is switched by the single registry key
        # sEEDeepValidate (checked in with Form 73620 in SP17).
        self.DEEP_VALIDATION_ENTITIES = {
            "greece_afm": validateGreeceAFM,
            "uk_nhs": validateUKNHS,
            "dutch_ssn": validateDutchSSN,
            "iban": validateIBAN,
            "ccn": validateCCN,
            "pesel": validate_pesel,
            "australia_medical_account": validate_austalia_medical_number,
            "australia_tax_file": validate_australia_tax_file_number,
            "canada_sin": validate_canadian_sin,
            "finland_hetu": validate_finnish_hetu,
            "france_insee": validate_french_insee,
            "ireland_ppsn": validate_irish_ppsn,
            "indian_aadhaar": validate_indian_aadhaar,
            "phone": filter_phone_entity,
            "south_africa_id": validate_south_africa_id,
        }

        # Extraction options and proximity-confidence state.
        self.specialArgs = {}
        self.PROXIMITY_SUPPORTED_PATTERNS = set()
        self.proxConf = None
        self.processed_entities = []

        # Start from the module-level shared HTTP sessions.
        self.documentContentSession = DOCUMENT_CONTENT_SESSION
        self.resultsPushSession = DOCUMENT_UPDATE_SESSION

    def initProxConfClass(self, logger):
        """Create the proximity-confidence helper and attach ``logger`` to it.

        The helper is built from ``USER_DEFINED_PROXIMITY_CONF`` and stored as
        ``self.proxConf``.
        """
        self.proxConf = scorer = ProximityBasedConfidence(USER_DEFINED_PROXIMITY_CONF)
        scorer.logger = logger

    def initPerfCounterClass(self, perfCounter):
        self.perfCounter = perfCounter
        self.proxConf.perfCounter = perfCounter

    def getDocumentContent(self, solrurl, contentid):
        """Fetch the text of one document through the view-source servlet.

        The text is requested from
        ``<solrurl>/viewsourceservlet?docid=<contentid>&getfield=zippedcontent``
        and, when ``PRE_PROCESS_TEXT`` is set, passed through ``clean`` with
        the line-break, multiple-space, hidden-text and unicode removers.

        On any failure the error is logged, ``self.state`` is set to the
        ``previewFailed`` response code, both HTTP sessions are replaced with
        fresh ones, and whatever content was obtained so far (possibly the
        empty string) is returned.

        Args:
            solrurl: base URL in front of ``/viewsourceservlet``.
            contentid: document id placed in the ``docid`` query parameter.

        Returns:
            str: the document text.  Bytes are decoded as UTF-8; text that is
            already ``str`` is returned unchanged (calling ``.decode`` on it,
            as was done before, raises ``AttributeError``).
        """
        module_name = "CvEEClient"
        func_name = "getDocumentContent"
        func_str = "{}::{}() - ".format(module_name, func_name)
        query = "{0}/viewsourceservlet?docid={1}&getfield=zippedcontent".format(solrurl, contentid)
        content = ""

        try:
            req = self.documentContentSession.get(query, stream=True)
            req.raw.decode_content = True
            content = req.text  # etree.parse(req.raw).getroot().text
            if PRE_PROCESS_TEXT:
                content = clean(
                    content,
                    removeLineBreaks,
                    removeMultipleSpaces,
                    removeHiddenText,
                    removeUnicodeCharacters,
                )

        except:
            # NOTE(review): the bare except also traps BaseException
            # subclasses; it is kept as-is because narrowing it would change
            # which failures are converted into the previewFailed state.
            LOGGER_Client.exception("Some exception for contentid {0}".format(contentid), func_str)
            self.state = DOCUMENT_RESPONSE_CODES["previewFailed"]
            self.documentContentSession = requests.Session()
            self.resultsPushSession = requests.Session()

        if isinstance(content, bytes):
            return content.decode("utf-8")
        return content

    @staticmethod
    def verify_solr_format(parsed_dt):
        solr_date_regex = re_library.compile("^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$")
        if len(solr_date_regex.findall(parsed_dt)) > 0:
            return True
        return False

    def refineDate(self, date_string):
        """Trim separator and punctuation noise from both ends of a date string.

        Args:
            date_string: candidate date text.

        Returns:
            str: ``""`` when ``validateDate`` rejects the input; otherwise the
            input with leading/trailing whitespace, ``/``, ``:``, ``-`` and any
            other characters outside ``a-z``/``0-9`` (case-insensitive) removed.
        """
        if not validateDate(date_string):
            return ""

        # Same strip sequence as before: whitespace, "/", ":", "-", whitespace.
        for edge_chars in (None, "/", ":", "-", None):
            date_string = date_string.strip(edge_chars)

        # Extra tokens (due, by, on, ...) are not removed here; per the original
        # note they are already excluded by the date regex itself.  What is left
        # to do is dropping non-alphanumeric characters at either end.
        for edge_pattern in ("^[^a-z0-9]+", "[^a-z0-9]+$"):
            date_string = re_library.sub(edge_pattern, "", date_string, flags=re.IGNORECASE)
        return date_string

    # split_size: split large text, set large number if no split
    def extractDate(self, text, split_size=10240):
        """Yield ``(date_string, iso_timestamp)`` pairs for dates found in ``text``.

        The text is scanned with ``datefinder`` in consecutive slices of
        ``split_size`` characters.  A candidate is reported only when it

        * is at least 8 characters long and was not seen before,
        * splits into more than two parts on space, backslash, ``/`` or ``-``,
        * can be parsed by ``datefinder`` and occurs verbatim in the slice,
        * contains more than four digits, and
        * is still at least 8 characters long after ``refineDate``.

        Args:
            text: text to scan; nothing is yielded for empty/None input.
            split_size: slice length.  Values that are zero or negative are
                treated as "no split" (previously the loop never advanced).

        Yields:
            tuple: the refined date string and ``as_dt.isoformat() + "Z"``.

        Raises:
            CvEETimeout.Timeout.ProcessingTimedOut: propagated to the caller,
                including when it is raised while a single candidate is being
                parsed (that case used to be swallowed by a bare ``except``).
        """
        #  TODO: this can be replaced via decorator. Will do it next form.
        module_name = "CvEEClient"
        func_name = "extractDate"
        func_str = "{}::{}() - ".format(module_name, func_name)
        if not text:
            return

        import datefinder

        date_finder = datefinder.DateFinder()
        text_len = len(text)
        step_size = min(split_size, text_len)
        if step_size <= 0:
            step_size = text_len
        string_set = set()  # date strings that have already been checked

        for cur_pos in range(0, text_len, step_size):
            sliced_text = text[cur_pos : cur_pos + step_size]

            try:
                extracted = date_finder.extract_date_strings(sliced_text)
                for date_string, _, captures in extracted:
                    if len(date_string) < 8:
                        continue

                    if date_string in string_set:
                        continue
                    string_set.add(date_string)

                    # Only candidates made of at least three parts are considered.
                    if not (
                        len(date_string.split(" ")) > 2
                        or len(date_string.split("\\")) > 2
                        or len(date_string.split("/")) > 2
                        or len(date_string.split("-")) > 2
                    ):
                        continue

                    try:
                        as_dt = date_finder.parse_date_string(date_string, captures)
                    except CvEETimeout.Timeout.ProcessingTimedOut:
                        # Same policy as the outer handler: timeouts must
                        # reach the caller.
                        raise
                    except Exception:
                        # in case date parsing fails just skip the current
                        # string and move on to the next one
                        continue
                    if as_dt is None:
                        continue

                    if date_string not in sliced_text:
                        continue

                    digits = re.sub("[^0-9]", "", date_string)
                    if len(digits) > 4 and digits.isdigit():  # skip strings with too few digits
                        date_string = self.refineDate(date_string)
                        if len(date_string) >= 8:
                            yield (date_string, as_dt.isoformat() + "Z")
            except TypeError as e:
                LOGGER_Generic.debug(
                    "Date parser failed to parse the string because of TypeError. Exception : {}".format(
                        e
                    ),
                    func_str,
                )
            except CvEETimeout.Timeout.ProcessingTimedOut:
                raise
            except Exception as e:
                LOGGER_Generic.debug(
                    "Got an exception during date extraction. Exception : {}".format(e),
                    func_str,
                )

    def cleanKeywordsForRegex(self, text):
        """Return ``text`` with every run of non-ASCII characters removed."""
        non_ascii_run = re.compile(r"[^\x00-\x7F]+")
        return non_ascii_run.sub("", text)

    def extractEntitiesViaRegex(
        self, text, pattern, patternKey, patternID, resultQ, id, source, N=-1
    ):
        module_name = "CvEEClient"
        func_name = "extractEntitiesViaRegex"
        func_str = "{}::{}() - ".format(module_name, func_name)

        try:
            global ENTITIES_KEYS, LOGGER_Generic, ENTITIES_KEYWORDS
            text = text.replace("\\\\", "\\")
            resultArr = []
            parsed_dt_arr = []
            resultStr = None

            try:
                self.perfCounter.start_stopwatch(patternID)
            except:
                pass

            if pattern == None:
                return
            elif (
                patternID == "phone"
                and patternKey not in ENTITIES_KEYWORDS
                and self.specialArgs["phone_entity_leniency"] > 0
            ):
                """ 
                handling phone number separately with phonenumbers library
                in case phone entity is having keywords, go via old regex route to calculate proximity confidence
                TODO: may need to change this to support the keywords proximity on top of phonenumbers matcher
                """
                for match in phonenumbers.PhoneNumberMatcher(
                    text, "US", leniency=self.specialArgs["phone_entity_leniency"]
                ):
                    resultArr.append(match.raw_string.strip())

            elif patternID == "usdl" or patternID == "iban":
                if "numbers" in pattern:
                    if patternID == "usdl":
                        if (
                            patternKey in ENTITIES_KEYWORDS
                            and len(ENTITIES_KEYWORDS[patternKey]) > 0
                        ):
                            keywords = ENTITIES_KEYWORDS[patternKey]
                            keywords_regex = "(?:(?<=[\\s\\,\\>\\<])|^)(?:{})(?=[\\s\\,\\<\\.\\>]|$).{{0,300}}".format(
                                self.cleanKeywordsForRegex("|".join(keywords))
                            )
                        else:
                            return
                    else:
                        keywords_regex = ""
                        # in case of iban get all the text with generic regex first
                        iban_global_pattern = "(?:[\\s\\,\\>]|\\b)[A-Z]{2}\\d{2}[0-9A-Z \\t\\-]{11,34}(?:[\\s\\,\\>]|\\b)"
                        iban_global_regex = re_library.compile(
                            iban_global_pattern, re.DOTALL | re.MULTILINE
                        )
                        generic_iban_text_list = [
                            custom_trim(r) for r in iban_global_regex.findall(text)
                        ]
                        generic_iban_texts = "\n".join(generic_iban_text_list)

                    for key in pattern["numbers"]:
                        regEx = re_library.compile(
                            keywords_regex + "(" + pattern["numbers"][key] + ")",
                            re.DOTALL | re.IGNORECASE | re.MULTILINE,
                        )
                        if patternID == "usdl":
                            result = [custom_trim(r) for r in regEx.findall(text)]
                        else:
                            result = [custom_trim(r) for r in regEx.findall(generic_iban_texts)]
                        counter = Counter(result)
                        for r, _ in counter.most_common():
                            output_str = "{0}:{1}".format(key, r)
                            if patternID == "iban":
                                output_str = r
                            if (
                                patternID in self.DEEP_VALIDATION_ENTITIES
                                and "deep_validate" in self.specialArgs
                                and self.specialArgs["deep_validate"] == True
                            ):
                                if self.DEEP_VALIDATION_ENTITIES[patternID](custom_trim(r)):
                                    resultArr.append(output_str)
                            else:
                                resultArr.append(output_str)

            elif self.specialArgs["extract_all_date_formats"] == 1 and patternID == "date":
                LOGGER_Generic.error("using datefinder")
                dates = self.extractDate(text)
                for d, parsed_dt in dates:
                    resultArr.append(d)
                    if EE.verify_solr_format(parsed_dt) is True:
                        parsed_dt_arr.append(parsed_dt)
            
            elif patternID == "date":
                numeric_date_regex = re_library.compile(pattern["numeric_date_regex"], re.DOTALL | re.IGNORECASE | re.MULTILINE)
                date_keyword_regex = re_library.compile(pattern["date_keyword_regex"], re.DOTALL | re.IGNORECASE | re.MULTILINE)
                
                dates_found = []                
                dates_found = dates_found + numeric_date_regex.findall(text)                
                keyword_processor = get_date_keyword_processor()
                date_keywords = keyword_processor.extract_keywords(text, span_info=True)
                reduced_text_list = []
                text_len = len(text)                
                for date_keyword, start, end in date_keywords:                    
                    new_text = ""                    
                    for idx in range(1,11):                        
                        if start-idx > 0 and text[start-idx] not in ("\n", "\r\n", os.linesep):
                            new_text+= text[start-idx]
                        else:
                            break
                    new_text=new_text[::-1]                    
                    new_text+=text[start:end]
                    for idx in range(0,11):                        
                        if end+idx < text_len and text[end+idx] not in ("\n", "\r\n", os.linesep):
                            new_text+= text[end+idx]
                        else:
                            break                    
                    reduced_text_list.append(new_text)
                reduced_text = " ".join(reduced_text_list)
                dates_found = dates_found + date_keyword_regex.findall(reduced_text)
                resultArr = []                
                validated_date = dict()
                for match in dates_found:
                    match = custom_trim(match)                    
                    if match in validated_date and validated_date[match]:
                        resultArr.append(match)                        
                        continue
                    if validate_date(match):
                        validated_date[match] = True
                        resultArr.append(match)     
                    else:
                        validated_date[match] = False                    

            elif patternKey in ENTITIES_KEYWORDS and len(ENTITIES_KEYWORDS[patternKey]) > 0:
                keywords = ENTITIES_KEYWORDS[patternKey]
                keywords.sort()
                stringified_keywords = "|".join([re.escape(keyword) for keyword in keywords])

                keywords_regex = (
                    "(?:(?:[\\s\\,\\>]|\\b)(?:" + stringified_keywords + ")(?:[\\s\\,\\<\\.]|\\b))"
                )

                if len(pattern) > 0:

                    self.PROXIMITY_SUPPORTED_PATTERNS.add(patternID)

                    if type(pattern) == type({}) and "entity_regex_loose" in pattern:
                        final_pattern = "(?P<loose>" + pattern["entity_regex_loose"] + ")"
                        if "entity_regex_strict" in pattern:
                            final_pattern += "|(?P<strict>" + pattern["entity_regex_strict"] + ")"
                    elif isinstance(pattern, str):
                        final_pattern = "(?P<loose>" + pattern + ")"
                    else:
                        return                    
                    entities = self.proxConf.proximalMatch(
                        final_pattern, keywords_regex, text, patternID
                    )
                    """
                    validate credit card numbers found using loose regex (i.e. without spaces)
                    by Luhn algorithm
                    All deep validation logic is moved to under one registry key "sEEDeepValidate"
                    """
                    # This code path is commented in SP17 and will be backported to SP16 as well
                    # but we need to handle this breakage for SP15 where this code path will be active.
                    # if patternID == "ccn":
                    #     for entity_val, _ in list(entities.items()):
                    #         if self.validateCCN(entity_val) == False:
                    #             del entities[entity_val]

                    values = set()
                    conf = []
                    bucket = []
                    card_types = []
                    for entity_tuple, entity_object in list(entities.items()):
                        entity_text = entity_tuple[0]
                        if (
                            patternID in self.DEEP_VALIDATION_ENTITIES
                            and "deep_validate" in self.specialArgs
                            and self.specialArgs["deep_validate"] == True
                            and not self.DEEP_VALIDATION_ENTITIES[patternID](
                                custom_trim(entity_text)
                            )
                        ):
                            continue
                        # in case a phone entity is starting with a + prefix (which is usually added for country codes)
                        # boost the confidence score so that we can return these phone numbers as valid entities
                        if patternID == "phone" and entity_text[0] == "+":
                            phone_entity_conf = entity_object["conf"]
                            entity_object["conf"] = (
                                phone_entity_conf
                                if float(phone_entity_conf) > MIN_CONF
                                else (MIN_CONF + 1.0)
                            )
                        if float(entity_object["conf"]) > MIN_CONF:
                            # This code path is for Offline CI
                            if entity_text not in values:
                                values.add(entity_text)
                                conf.append("{0:.2f}".format(entity_object["conf"]))
                                bucket.append(entity_object["bucket"])
                                # if patternID == "ccn":
                                #     card_types.append(entity_object["card_type"])

                            # This code path is for CA/GDPR
                            entity_details = entity_object.copy()
                            entity_details["entity"] = entity_text
                            entity_details["conf"] = "{0:.2f}".format(entity_details["conf"])

                            resultArr.append(entity_details)
                        else:
                            del entities[entity_tuple]

                    # Doing it this way because the format() function
                    # does not support unicode characters

                    values_string = ",".join(list(values))
                    conf_string = ",".join(conf)
                    bucket_string = ",".join([str(b) for b in bucket])
                    # card_types_string = ",".join(card_types)

                    resultStr = (
                        values_string
                        + ":"
                        + conf_string
                        + ":"
                        + bucket_string
                        # + ":"
                        # + card_types_string
                    )
                    result_count = len(values)

                else:
                    
                    regex = re_library.compile(keywords_regex, re.DOTALL | re.IGNORECASE | re.MULTILINE)

                    result = regex.findall(text)
                    counter = Counter(result)

                    for r, count in counter.most_common():
                        resultArr.append(r)

            else:

                regEx = None

                """  
                in case all the keywords are removed for a system defined entity which is defined in below format
                {
                    "entity_regex_strict": "",
                    "entity_regex_loose": "",
                }
                using strict regex in this case as loose regex may lead to lots of false positive
                """
                if type(pattern) == type({}) and "entity_regex_strict" in pattern:
                    pattern = pattern["entity_regex_strict"]
                elif type(pattern) == type({}) and "entity_regex_loose" in pattern:
                    pattern = pattern["entity_regex_loose"]
                elif type(pattern) == type(
                    {}
                ):  # hopefully this case will never be there but to be on the safe side
                    pattern = ""

                if (
                    patternID == "finance_tags"
                    and "use_re2" in self.specialArgs
                    and self.specialArgs["use_re2"] == True
                ):
                    regEx = re_library.compile(pattern, re2.DOTALL | re2.IGNORECASE | re2.MULTILINE)
                  
                if regEx == None:
                    try:
                        regEx = re_library.compile(pattern, re.DOTALL | re.IGNORECASE | re.MULTILINE)
                    except:
                        if source == "internal":
                            LOGGER_Generic.exception(
                                "The regex for pattern {} failed to compile. Setting document state as failed".format(
                                    patternID
                                ),
                                func_str,
                            )
                            self.state = DOCUMENT_RESPONSE_CODES["invalidRegex"]
                        else:
                            self.results["ErrorCode"] = CA_ERROR_CODES["RegexError"]
                            self.results[
                                "ErrorMessage"
                            ] = "Regex compile failed for pattern {}".format(patternID)
                            LOGGER_Generic.exception(
                                "Pattern {} for contentid {} failed during regex compilation".format(
                                    patternID, id
                                ),
                                func_str,
                            )
                            return

                if regEx != None:
                    if patternID == "phone":
                        result = regEx.findall(text)
                    else:
                        result = [custom_trim(r) for r in regEx.findall(text)]

                    """
                    Duplicated from above to handle no keyword use case
                    validate credit card numbers found using loose regex (i.e. without spaces)
                    by Luhn algorithm
                    All deep validation logic is moved to under one registry key "sEEDeepValidate"
                    """
                    # if patternID == "ccn":
                    #     filtered_entities = []
                    #     for entity_val in result:
                    #         if self.validateCCN(entity_val) == True:
                    #             filtered_entities.append(entity_val)
                    #     result = filtered_entities

                    if (
                        patternID in self.DEEP_VALIDATION_ENTITIES
                        and "deep_validate" in self.specialArgs
                        and self.specialArgs["deep_validate"] == True
                    ):
                        result = [
                            entity
                            for entity in result
                            if self.DEEP_VALIDATION_ENTITIES[patternID](custom_trim(entity))
                        ]

                    # verify Polish pesel number using check digit validation
                    # if patternID == "pesel":
                    #     result = [entity for entity in result if self.validate_pesel(entity)]

                    counter = Counter(result)

                    # Custom result string
                    if patternID == "phone":
                        resultArr = self.processPhoneResult(counter)
                        result_count = len(resultArr)

                    else:
                        for r, count in counter.most_common():
                            resultArr.append(r)

            resultArr_dict = {}
            resultArr_filtered = list()
            for entity in resultArr:
                entity_text = entity
                if type(entity_text) == dict:
                    entity_text = entity["entity"]
                if entity_text.lower() not in resultArr_dict:
                    resultArr_dict[entity_text.lower()] = entity
                    # offline CI needs only unique entities and count, case insensitive
                    if source == "internal":
                        resultArr_filtered.append(entity_text)
                if source != "internal":
                    resultArr_filtered.append(resultArr_dict[entity_text.lower()])

            if resultStr == None:
                resultStr = ",".join(resultArr_filtered)
                result_count = len(resultArr_filtered)
            self.processed_entities.append(patternID)

            if source == "internal":
                resultQ.put((patternID, resultStr, result_count), block=True)
            else:
                LOGGER_Generic.debug(
                    "Finished extracting pattern {} for contentid {}".format(patternID, id),
                    func_str,
                )

                self.results[patternID] = resultArr_filtered
                if len(parsed_dt_arr) > 0:
                    # LOGGER_Generic.debug("Parsed dates {}".format(parsed_dt_arr))
                    self.results["dt_" + patternID] = parsed_dt_arr
                    if "skipFields" not in self.results:
                        self.results["skipFields"] = ["dt_" + patternID]
                    else:
                        self.results["skipFields"].append("dt_" + patternID)

            try:
                extracted_entities_count = 0
                if self.results is not None and patternID in self.results:
                    extracted_entities_count = len(self.results[patternID])
                self.perfCounter.stop_stopwatch(
                    patternID, size=len(text), entity_count=extracted_entities_count
                )
            except:
                pass
        except CvEETimeout.Timeout.ProcessingTimedOut as e:
            raise
        except:
            if source == "internal":
                LOGGER_Client.exception(
                    "Error while extracting {} for contentid {}".format(patternID, id), func_str
                )
            else:
                LOGGER_Generic.exception(
                    "Error while extracting {} for contentid {}".format(patternID, id), func_str
                )

    def extractEntities(self, text, entities, id, solrurl, eeType="regex", source="internal"):
        """Run regex extraction for every requested entity type of one document.

        For each pattern id in ``entities`` that has a regex registered in
        ``ENTITIES_REGEX``, ``extractEntitiesViaRegex`` is either spawned on
        ``self.pool`` or, when ``LOW_PERF`` is set, called inline. Unless
        ``LOW_PERF`` is set, the pool is joined before this method returns.

        Args:
            text: Content to scan.
            entities: Iterable of pattern ids to extract.
            id: Content id of the document; used in the Solr lookup and in logs.
            solrurl: Base Solr URL, or None to skip every Solr interaction.
            eeType: Extraction type; any value other than "regex" is a no-op.
            source: With "internal", ``self.queue`` is emptied and handed to the
                extraction calls; with any other value no queue is passed.

        Returns:
            None on the normal path. If an exception is caught and ``source`` is
            not "internal", a dict with "ErrorCode" and "ErrorMessage".

        Raises:
            CvEETimeout.Timeout.ProcessingTimedOut: re-raised unchanged.
        """
        module_name = "CvEEClient"
        func_name = "extractEntities"
        func_str = "{}::{}() - ".format(module_name, func_name)

        def dispatch(pattern_id, result_queue):
            # Single place that chooses between pooled and inline execution.
            call_args = (
                text,
                ENTITIES_REGEX[pattern_id],
                pattern_id,
                ENTITIES_KEYS[pattern_id],
                result_queue,
                id,
                source,
            )
            if LOW_PERF:
                self.extractEntitiesViaRegex(*call_args)
            else:
                self.pool.spawn(self.extractEntitiesViaRegex, *call_args)

        try:
            if eeType == "regex":
                conn = None
                if solrurl is not None:
                    conn = SolrConnection(solrurl + "/solr")
                if source == "internal":
                    resultQ = self.queue
                    # Drop anything still sitting in the queue before new results arrive.
                    while not resultQ.empty():
                        resultQ.get()
                else:
                    resultQ = None

                for patternID in entities:
                    if patternID not in ENTITIES_REGEX or ENTITIES_REGEX[patternID] is None:
                        continue

                    if CHECK_ENTITIES and solrurl is not None and source == "internal":
                        # Ask Solr whether this entity field is already stored for the doc.
                        entity_field = "entity_" + ENTITIES_KEYS[patternID]
                        response = conn.query("id:%s" % id, fields=[entity_field], rows=1)
                        if entity_field in response.results[0]:
                            # Already present: enqueue an empty result instead of re-extracting.
                            resultQ.put((ENTITIES_KEYS[patternID], "", 0), block=True)
                            continue
                    elif source != "internal":
                        LOGGER_Generic.debug(
                            "About to extract entity {} for contentid {}".format(
                                ENTITIES_KEYS[patternID], id
                            ),
                            func_str,
                        )

                    dispatch(patternID, resultQ)

                if not LOW_PERF:
                    self.pool.join(raise_error=True)
        except CvEETimeout.Timeout.ProcessingTimedOut:
            raise
        except Exception as e:
            self.state = DOCUMENT_RESPONSE_CODES["solrRequestFailed"]
            if source == "internal":
                LOGGER_Client.exception(
                    "There was an exception while parsing regex or querying Solr for doc {} ".format(
                        id
                    ),
                    func_str,
                )
            else:
                return {
                    "ErrorCode": CA_ERROR_CODES["RERError"],
                    "ErrorMessage": "Exception while parsing regex for the document: {}".format(e),
                }

    def commit(self):
        """Send a Solr commit if one is pending, then schedule the next check.

        When ``self.docToCommit`` is set, a GET is issued against
        ``<solrurl>/solr/update?commit=true`` using ``self.documentContentSession``.
        On success the pending flag is cleared; on failure the error is logged
        and the session is replaced with a fresh ``requests.Session``. In both
        cases a 10 second ``Timer`` is started that calls this method again.
        """
        func_str = "{}::{}() - ".format("CvEEClient", "commit")

        if self.docToCommit == True:
            response = None
            commit_url = "{0}/solr/update?commit=true".format(self.config["solrurl"])
            try:
                response = self.documentContentSession.get(commit_url)
                LOGGER_Client.info("Commit run", func_str)
                self.docToCommit = False
            except Exception as e:
                LOGGER_Client.exception(f"Exception during commit. {e}", func_str)
                # Start over with a new session after a failed request.
                self.documentContentSession = requests.Session()
            finally:
                if response is not None:
                    response.close()

        # Re-arm: this method runs again in 10 seconds.
        next_run = Timer(10, self.commit)
        next_run.start()

    def pushEntities(self, solrurl, contentid, no_patterns):
        """Collect extraction results from ``self.queue`` and post them to Solr.

        Reads ``no_patterns`` tuples ``(patternID, resultStr, result_count)``
        from the queue (blocking reads). Every pattern with a positive count
        contributes ``entity_<id>`` and ``entity_count_<id>`` fields; patterns
        listed in ``self.PROXIMITY_SUPPORTED_PATTERNS`` additionally contribute
        ``entity_conf_<id>``. The field set is then posted as
        multipart/form-data to ``<solrurl>/solr/updatedoc``.

        Args:
            solrurl: Base Solr URL.
            contentid: Content id of the document being updated.
            no_patterns: Number of result tuples to read from the queue.

        Returns:
            None. Failures are logged; a failure while handling a queue item
            also sets ``self.state`` to the "queueException" response code.
        """
        module_name = "CvEEClient"
        func_name = "pushEntities"
        func_str = "{}::{}() - ".format(module_name, func_name)

        entity_count_obj = {}
        parameters = {
            "operationtype": "updatefields",
            "q": "contentid:{0}".format(contentid),
        }

        field_names = []
        for _ in range(no_patterns):
            # Reset per iteration so the error log below never hits an unbound
            # name (queue read failed on the first item) or reports the pattern
            # of a previous iteration.
            patternID = None
            try:
                (patternID, resultStr, result_count) = self.queue.get(True)

                if result_count > 0:
                    field_names.append("entity_{0}".format(patternID))
                    field_names.append("entity_count_{0}".format(patternID))
                    parameters["entity_count_" + patternID] = result_count

                    if patternID in self.PROXIMITY_SUPPORTED_PATTERNS:
                        # The result string is split on ":"; the first part holds
                        # the values and the second the confidences.
                        resultArr = resultStr.split(":")
                        values = resultArr[0]
                        conf = resultArr[1]
                        parameters["entity_" + patternID] = values
                        parameters["entity_conf_" + patternID] = conf
                        entity_count_obj[patternID] = result_count
                        field_names.append("entity_conf_{}".format(patternID))
                    else:
                        parameters["entity_" + patternID] = resultStr
                        entity_count_obj[patternID] = result_count
            except:  # noqa: E722 - breadth of the original handler kept on purpose
                self.state = DOCUMENT_RESPONSE_CODES["queueException"]
                LOGGER_Client.exception(
                    "Exception for pattern {} for contentid {} while receiving from queue. The entity state for the document will be marked as {}".format(
                        patternID, contentid, self.state
                    ),
                    func_str,
                )

        field_names.extend(("entity_extractedat", "entity_state"))
        # Timezone-aware replacement for the deprecated utcnow(); the formatted
        # string is the same.
        parameters["entity_extractedat"] = datetime.datetime.now(datetime.timezone.utc).strftime(
            "%Y-%m-%dT%H:%M:%SZ"
        )
        parameters["entity_state"] = self.state
        parameters["fields"] = ",".join(field_names)

        # To handle all sizes of data, the update is posted as
        # multipart/form-data instead of the default
        # application/x-www-form-urlencoded. Each value therefore becomes a
        # "file" tuple whose filename is None, because no real file is sent.
        params_as_file = {}
        for key, val in parameters.items():
            if type(val) is int:
                params_as_file[key] = (None, str(val))
            else:
                params_as_file[key] = (None, val, "text/plain; charset=utf-8")

        req = None
        try:
            url = "{0}/solr/updatedoc".format(solrurl)
            req = self.resultsPushSession.post(url, files=params_as_file)
            LOGGER_Client.info(
                "Update for doc {0} returned {1}".format(contentid, req.status_code), func_str
            )
            LOGGER_Client.debug(
                "Counts of entities extracted for doc {} are as follows: {}".format(
                    contentid, entity_count_obj
                ),
                func_str,
            )
        except:  # noqa: E722 - breadth of the original handler kept on purpose
            LOGGER_Client.exception(
                "Could not push to Solr for contentid {0}".format(contentid), func_str
            )
        finally:
            # Close the response the same way commit() does.
            if req is not None:
                req.close()

    def processPhoneResult(self, counter):
        """Return the first capture group of every phone match, most frequent first.

        Args:
            counter: ``Counter`` whose keys are 6-element tuples (the capture
                groups of one match).

        Returns:
            list: First element of each key, in ``most_common()`` order.
        """
        return [
            first_group
            for (first_group, _g2, _g3, _g4, _g5, _g6), _hits in counter.most_common()
        ]


def EEandPush(document, entities, solrurl, perfCounter):
    """Fetch one document, extract its entities and push the results to Solr.

    A new ``EE`` instance is configured from the module-level settings, then
    the three phases (content fetch, entity extraction, Solr update) run in
    order, each wrapped in its own stopwatch on the performance counter.

    Args:
        document: Id of the document to process.
        entities: Entity pattern ids to extract.
        solrurl: Base Solr URL.
        perfCounter: Performance counter handed to the ``EE`` instance.

    Returns:
        The module-level ``pid`` value.
    """
    func_str = "{}::{}() - ".format("CvEEClient", "EEandPush")
    LOGGER_Client.info(f"{document} {entities} {solrurl}", func_str)

    extractor = EE(getConfiguration())
    for arg_name, arg_value in (
        ("use_re2", USE_RE2),
        ("deep_validate", DEEP_VALIDATION),
        ("phone_entity_leniency", PHONE_ENTITY_LENIENCY),
        ("extract_all_date_formats", EXTRACT_ALL_DATE_FORMATS),
    ):
        extractor.specialArgs[arg_name] = arg_value
    extractor.initProxConfClass(LOGGER_Client)
    extractor.initPerfCounterClass(perfCounter)

    # Phase 1: content fetch
    extractor.perfCounter.start_stopwatch("ContentFetch")
    doc_text = extractor.getDocumentContent(solrurl, document)
    extractor.perfCounter.stop_stopwatch("ContentFetch", len(doc_text))

    # Phase 2: entity extraction
    extractor.perfCounter.start_stopwatch("EntityExtraction")
    extractor.extractEntities(doc_text, entities, document, solrurl)
    extractor.perfCounter.stop_stopwatch("EntityExtraction", len(doc_text))

    # Phase 3: Solr update
    extractor.perfCounter.start_stopwatch("SolrUpdate")
    extractor.pushEntities(solrurl, document, len(extractor.processed_entities))
    extractor.perfCounter.stop_stopwatch("SolrUpdate")

    # Drop the local references before returning.
    del doc_text, extractor
    return pid


def doSoftCommit(solrurl):
    """Ask Solr for a soft commit.

    Issues a GET against ``<solrurl>/solr/update?commit=true&softcommit=true``
    (e.g. http://127.0.0.1:27000/solr/update?commit=true&softcommit=true)
    through the module-level ``DOCUMENT_CONTENT_SESSION``. This is a plain
    function, so there is no ``self`` to take a session from. If the request
    fails, the error is logged and the shared session is replaced.

    Args:
        solrurl: Base Solr URL; None or an empty value makes this a no-op.

    Returns:
        None.
    """
    global DOCUMENT_CONTENT_SESSION

    module_name = "CvEEClient"
    func_name = "doSoftCommit"
    func_str = "{}::{}() - ".format(module_name, func_name)

    if solrurl is None or len(solrurl) == 0:
        return

    query = "{0}/solr/update?commit=true&softcommit=true".format(solrurl)
    req = None
    try:
        req = DOCUMENT_CONTENT_SESSION.get(query)
        LOGGER_Client.info("Softcommit returned {0}".format(req.status_code), func_str)
    except Exception as e:
        LOGGER_Client.exception(f"Exception occurred during soft commit. {e}", func_str)
        # Start over with a new session after a failed request.
        DOCUMENT_CONTENT_SESSION = requests.Session()
    finally:
        # req stays None when the GET itself raised.
        if req is not None:
            req.close()


class EEHandler:
    """Handles extraction job messages: one worker thread per document id."""

    def __init__(self, config):
        """Store the configuration and reset all counters.

        Args:
            config: Mapping that must contain a "perfCounter" entry.
        """
        self._config = config
        self.perfCounter = config["perfCounter"]
        self.solrurl = None
        self.result_list = []
        self.STOP = False

        self.NUMBER_OF_PROCESSES = max(cpu_count(), 4)
        self.NUMBER_OF_PROCESSES = 10  # FOR DEBUGGING

        # Soft commit triggers: document count, and timer interval in seconds.
        self.SOFT_COMM_LIMIT = 10000
        self.SOFT_COMM_TIME = 30

        self._message_number = 0
        self.PROCESSED_DOCS = 0
        self.PROCESSED_BATCHES = 0
        # self.processPool = Pool(initializer=processInit,processes=self.NUMBER_OF_PROCESSES*5)

    def getDocuments(self, solrResponse):
        """Yield each non-empty ``results`` batch, following ``next_batch()``."""
        page = solrResponse
        while len(page.results) > 0:
            yield page.results
            page = page.next_batch()

    def softCommit(self):
        """Soft commit if documents were processed, then re-arm the timer."""
        if self.PROCESSED_DOCS > 0:
            doSoftCommit(self.solrurl)
            self.PROCESSED_DOCS = 0
        rearm = Timer(self.SOFT_COMM_TIME, self.softCommit)
        rearm.start()

    def stop(self):
        """Set the STOP flag that is passed to the update callback."""
        func_str = "{}::{}() - ".format("CvEEClient.EEHandler", "stop")
        LOGGER_Client.info("Stopping the Entity Extraction Handler", func_str)
        self.STOP = True
        # self.processPool.terminate()
        # self.processPool.close()
        # self.processPool.join()

    def setUpdateFunction(self, fun):
        """Register the callback invoked at the end of ``processMessage``."""
        self._updateFun = fun

    def processMessage(self, stompFrame):
        """Process one job message.

        The frame is parsed as JSON. For every id in ``payload["ids"]`` a
        thread running ``EEandPush`` is started. After all threads have been
        joined, a soft commit is issued if the processed-document count has
        reached ``SOFT_COMM_LIMIT``, and the update callback is called with
        ``(payload, self.STOP)``.

        Args:
            stompFrame: JSON text with "ids", "entities" and "solrurl" keys.
        """
        func_str = "{}::{}() - ".format("CvEEClient", "processMessage")

        with TimeIt("Job finished in ", func_str):
            self._message_number += 1
            payload = ujson.loads(stompFrame)
            LOGGER_Client.info("Processing [%d]: [%s]" % (self._message_number, payload), func_str)

            started_since_drain = 0
            workers = []
            for document in payload["ids"]:
                try:
                    worker = Thread(
                        target=EEandPush,
                        args=(document, payload["entities"], payload["solrurl"], self.perfCounter),
                    )
                    workers.append(worker)
                    worker.start()
                    started_since_drain += 1
                    self.PROCESSED_DOCS += 1
                    if started_since_drain == (MAX_THREADS - 1):
                        # Drain: wait up to EXTRACT_TIMEOUT for each outstanding
                        # worker; it leaves the list whether or not it finished.
                        while workers:
                            workers[0].join(timeout=EXTRACT_TIMEOUT)
                            del workers[0]
                        started_since_drain = 0
                except:
                    LOGGER_Client.exception("*** Exception ****", func_str)

            # Wait, without a timeout, for whatever is still outstanding.
            for worker in workers:
                worker.join()

            if self.PROCESSED_DOCS >= self.SOFT_COMM_LIMIT:
                doSoftCommit(payload["solrurl"])
                self.PROCESSED_DOCS = 0

            self.PROCESSED_BATCHES += 1
            LOGGER_Client.info("Writing back the status on to REEQueue", func_str)
            self._updateFun(payload, self.STOP)


class CommandManager(Thread):
    """Thread that waits for a "stop" command and then runs the terminators."""

    def __init__(self, commandQ, *terminatorChain):
        """
        Args:
            commandQ: Queue-like object providing ``get()``, or None.
            *terminatorChain: Zero-argument callables run by ``doStop``.
        """
        super().__init__()
        self._commandQ = commandQ
        self._terminatorChain = terminatorChain

    def doStop(self):
        """Call every registered terminator, in the order given."""
        if self._terminatorChain is None:
            return
        for terminate in self._terminatorChain:
            terminate()

    def run(self):
        """Read commands until "stop" arrives, then call ``doStop``.

        Returns at once when no command queue was supplied. A
        ``KeyboardInterrupt`` while waiting also triggers ``doStop``.
        """
        if self._commandQ is None:
            return
        try:
            # Anything other than "stop" is read and ignored.
            while self._commandQ.get() != "stop":
                continue
            self.doStop()
        except KeyboardInterrupt:
            self.doStop()


def preProcess(params={}):
    """Pre-processing step; always reports success.

    Args:
        params: Accepted but not read.

    Returns:
        dict: ``{"ErrorCode": CA_ERROR_CODES["success"]}``.
    """
    response = {}
    response["ErrorCode"] = CA_ERROR_CODES["success"]
    return response


# @profile
def doAnalysis(processing_input, params={}):
    """Run regex-based entity recognition (RER) on one document.

    Args:
        processing_input: Document payload as a string. It is first parsed as
            JSON; the ``attrValue`` of every ``dataList.optTypedata`` item whose
            ``attrKey`` is listed in ``params["EntityExtractionFields"]`` is
            concatenated into the text to analyse. If that step raises, the raw
            string is analysed instead.
        params: Options dict. "logger" must be present (otherwise a
            "loggerNotFound" result is returned). "perfCounter" and
            "entities_attributes" are indexed directly. Optional keys read
            here: "proximity_conf", "entitiesToExtractRER" (comma separated
            string), "low_perf", "contentid", "childDocs",
            "EntityExtractionFields" (comma separated string) and
            "pre_process_text". The dict is modified in place:
            "entitiesToExtractRER" and "EntityExtractionFields" are replaced
            by lists.

    Returns:
        dict: When ``params["childDocs"]`` is truthy, a dict with the keys 0, 1
        and 2 (each mapping entity key -> list of values) plus "ErrorCode" and
        "ErrorMessage". Otherwise a dict mapping entity key -> list of values
        plus "ErrorCode" and "ErrorMessage". If an unexpected exception is
        caught, a dict with the "RERError" code and a message.

    Raises:
        CvEETimeout.Timeout.ProcessingTimedOut: re-raised unchanged.
    """
    module_name = "CvEEClient"
    func_name = "doAnalysis"
    func_str = "{}::{}() - ".format(module_name, func_name)
    global ENTITIES_REGEX, LOGGER_Generic, LOW_PERF, ENTITIES_KEYS, CA_ERROR_CODES, USER_DEFINED_PROXIMITY_CONF, ENTITIES_KEYWORDS
    global ENTITIES_GLOBAL_MAP
    try:
        # The caller's logger replaces the module-level LOGGER_Generic.
        if "logger" in params:
            LOGGER_Generic = params["logger"]
        else:
            return {
                "ErrorCode": CA_ERROR_CODES["loggerNotFound"],
                "ErrorMessage": "Logger object was not sent in params",
            }

        config = getConfiguration()
        ee = EE(config)

        # The proximity-confidence class is initialised only when the caller
        # supplied a "proximity_conf" entry.
        if "proximity_conf" in params:
            USER_DEFINED_PROXIMITY_CONF = params["proximity_conf"]
            ee.initProxConfClass(LOGGER_Generic)

        ee.initPerfCounterClass(params["perfCounter"])
        params["perfCounter"].start_stopwatch("TotalRER")
        params["perfCounter"].start_stopwatch("RERPreProcess")

        # Publish the entity definitions as module-level globals.
        ENTITIES_KEYS = params["entities_attributes"]["entities_keys"]
        ENTITIES_REGEX = params["entities_attributes"]["entities_regex"]
        ENTITIES_KEYWORDS = params["entities_attributes"]["entities_keywords"]

        # Every param except "entitiesToExtractRER" is copied into
        # ee.specialArgs; "entitiesToExtractRER" is instead split, in place,
        # from a comma separated string into a list of stripped tokens.
        for key, val in list(params.items()):
            if key != "entitiesToExtractRER":
                ee.specialArgs[key] = val
            else:
                params["entitiesToExtractRER"] = [
                    x.strip() for x in params["entitiesToExtractRER"].strip(",").split(",")
                ]

        entities = []
        contentId = ""
        keys = list(ENTITIES_REGEX.keys())

        if "low_perf" in params:
            LOW_PERF = params["low_perf"]
        if "contentid" in params:
            contentId = params["contentid"]

        childDocs = False
        if "childDocs" in params:
            childDocs = params["childDocs"]

        # Keep the requested entities whose integer form is a known key.
        # NOTE(review): the else branch rebinds `keys` to ENTITIES_KEYS for all
        # later iterations and for the "no entities" fallback below - confirm
        # this is intended.
        if params is not None and "entitiesToExtractRER" in params:
            for e in params["entitiesToExtractRER"]:
                if e != "":
                    if int(e) in keys:
                        entities.append(int(e))
                    else:
                        keys = ENTITIES_KEYS
                        if int(e) in keys:
                            entities.append(int(e))

        # Nothing usable was requested: fall back to every key.
        if len(entities) == 0:
            entities = keys

        # drop duplicate entities from the list
        entities = list(set(entities))

        # First parse content to JSON
        try:
            # Fields to read from the payload; defaults to just "content".
            if "EntityExtractionFields" in params:
                params["EntityExtractionFields"] = (
                    params["EntityExtractionFields"].strip(",").split(",")
                )
            else:
                params["EntityExtractionFields"] = ["content"]

            jsonified_input = ujson.loads(processing_input)
            stripped_input = ""
            if "dataList" in jsonified_input and "optTypedata" in jsonified_input["dataList"]:
                for item in jsonified_input["dataList"]["optTypedata"]:
                    if (
                        "attrKey" in item
                        and "attrValue" in item
                        and item["attrKey"] in params["EntityExtractionFields"]
                    ):
                        # Each selected value goes on its own line; fields other
                        # than "content" are preceded by a line with the field name.
                        stripped_input += (
                            f"{item['attrKey']}{os.linesep}" if item["attrKey"] != "content" else ""
                        ) + f"{item['attrValue']}{os.linesep}"
        except Exception as e:
            # NOTE(review): unlike the other logger calls in this function, this
            # one does not pass func_str - confirm the logger accepts that.
            LOGGER_Generic.exception(
                "There was an error while parsing the JSON of the temp file for doc {}".format(
                    contentId
                )
            )
            # Fall back to analysing the raw input.
            stripped_input = processing_input

        # Text clean-up runs unless "pre_process_text" is present and falsy.
        if "pre_process_text" not in params or params["pre_process_text"]:
            clean_input = clean(
                stripped_input, removeLineBreaks, removeMultipleSpaces, removeHiddenText
            )
        else:
            clean_input = stripped_input

        params["perfCounter"].stop_stopwatch("RERPreProcess")

        # No Solr URL is passed; the return value of this call is not used,
        # the results are read from ee.results below.
        ee.extractEntities(clean_input, entities, contentId, None, source="external")

        params["perfCounter"].start_stopwatch("RERPostProcess")

        # Results grouped by bucket 0/1/2, next to the flat result dict.
        final_segregated_results = {0: {}, 1: {}, 2: {}}
        final_results = {}
        results = ee.results
        entity_count_obj = {}
        default_special_keys = ["usdl", "date", "ErrorCode", "ErrorMessage"]
        differently_processed_keys = default_special_keys[:]
        differently_processed_keys.extend(ee.PROXIMITY_SUPPORTED_PATTERNS)
        # Pass 1: keys that are neither special nor proximity-supported. Every
        # value gets the "upper_conf" confidence and is filed under bucket 2.
        for key, values in list(results.items()):
            try:
                if len(values) == 0:
                    del results[key]
                else:
                    if key not in differently_processed_keys:
                        results[key] = []
                        for v in values:
                            res = {"value": v, "conf": USER_DEFINED_PROXIMITY_CONF["upper_conf"]}
                            results[key].append(res)
                            if key not in final_results:
                                final_results[key] = []
                                final_segregated_results[2][key] = []
                            final_results[key].append(v)
                            final_segregated_results[2][key].append(v)
                        entity_count_obj[key] = len(values)
            except:
                # Copy the error recorded in the extractor results.
                # NOTE(review): these lookups raise KeyError when results has no
                # "ErrorCode"/"ErrorMessage"; that is then handled by the outer
                # except at the end of this function.
                final_results["ErrorCode"] = results["ErrorCode"]
                final_results["ErrorMessage"] = results["ErrorMessage"]
                LOGGER_Generic.exception("Unhandled entity type", func_str)

        # Pass 2: proximity-supported patterns. Each entity is filed under its
        # own "bucket"; only entities with conf above MIN_CONF reach the flat
        # final_results.
        for key in list(ee.PROXIMITY_SUPPORTED_PATTERNS)[:]:
            if key in results:
                if len(results[key]) > 0:
                    for entity in results[key]:
                        if key not in final_segregated_results[entity["bucket"]]:
                            final_segregated_results[entity["bucket"]][key] = []

                        if float(entity["conf"]) > MIN_CONF:
                            if key not in final_results:
                                final_results[key] = []
                            final_results[key].append(entity["entity"])
                        final_segregated_results[entity["bucket"]][key].append(entity["entity"])

                    entity_count_obj[key] = len(results[key])
                else:
                    del results[key]

        # Dates are copied as they are, into the flat results and bucket 2.
        if "date" in results:
            final_results["date"] = results["date"][:]
            final_segregated_results[2]["date"] = results["date"][:]
            entity_count_obj["date"] = len(results["date"])

        # "usdl" values are split on ":"; the part before the first colon is
        # collected as the state, the second part is the value that is returned.
        if "usdl" in results:
            usdl = results["usdl"][:]
            usdl_state = []
            for value in usdl:
                usdl_state.append(value.split(":")[0])

            # Adding this line of code, as we do not need states in GDPR
            usdl_value = [v.split(":")[1] for v in usdl]

            # Removing this line of code for now, as we don't need it for GDPR
            # final_results['usdl_state'] = usdl_state

            final_results["usdl"] = usdl_value
            final_segregated_results[2]["usdl"] = usdl_value

            entity_count_obj["usdl"] = len(usdl)

        LOGGER_Generic.debug(
            "Count of entities extracted for contentid {} are as follows: {}".format(
                contentId, entity_count_obj
            ),
            func_str,
        )
        # Report success unless an error code was copied over in pass 1.
        if "ErrorCode" not in final_results:
            final_results["ErrorCode"] = CA_ERROR_CODES["success"]
            final_results["ErrorMessage"] = None
        final_segregated_results["ErrorCode"] = final_results["ErrorCode"]
        final_segregated_results["ErrorMessage"] = final_results["ErrorMessage"]

        params["perfCounter"].stop_stopwatch("RERPostProcess")
        params["perfCounter"].stop_stopwatch("TotalRER")
        # childDocs selects the bucketed result over the flat one.
        if childDocs:
            return final_segregated_results
        else:
            return final_results
    except CvEETimeout.Timeout.ProcessingTimedOut as e:
        # Timeouts propagate to the caller.
        raise
    except Exception as e:
        LOGGER_Generic.exception("Error while performing RER", func_str)
        return {
            "ErrorCode": CA_ERROR_CODES["RERError"],
            "ErrorMessage": "Pattern recognition failure: {}".format(e),
        }


def main_client(commandQ=None, conf=None, key=1, parent_pid=0, entities_attributes={}):
    """Configure the module globals and run the entity extraction client.

    Args:
        commandQ: Queue read by the ``CommandManager`` thread, or None.
        conf: Mapping of setting name -> ``{"value": ...}``; when given, its
            values overwrite the corresponding module-level settings.
        key: Stored in ``PROCESS_IDX``.
        parent_pid: Passed to ``checkParentAndKill`` with this process id.
        entities_attributes: Mapping with "entities_keys", "entities_regex"
            and "entities_keywords".
    """
    global STOMP_PORT, LOGGER_Client, LOW_PERF, PRE_PROCESS_TEXT
    global ROTATING_BACKUP_COUNT, ROTATING_MAX_BYTES, EXTRACT_TIMEOUT, NO_DOCS_TO_PROCESS, USER_DEFINED_PROXIMITY_CONF
    global MAX_THREADS, CHECK_ENTITIES, HARD_DEBUG, PROCESS_IDX, USE_RE2, DEEP_VALIDATION, CHECK_FAILURES, PHONE_ENTITY_LENIENCY, EXTRACT_ALL_DATE_FORMATS
    global ENTITIES_REGEX, ENTITIES_KEYS, ENTITIES_KEYWORDS

    func_str = "{}::{}() - ".format("CvEEClient", "")

    PROCESS_IDX = key

    if conf is not None:

        def setting(name):
            # Every configuration entry keeps its payload under "value".
            return conf[name]["value"]

        MAX_THREADS = setting("max_threads")
        EXTRACT_TIMEOUT = setting("extract_timeout")
        CHECK_ENTITIES = setting("check_entities")
        CHECK_FAILURES = setting("check_failures")
        STOMP_PORT = setting("stompPort")
        ROTATING_MAX_BYTES = setting("log_max_bytes")
        ROTATING_BACKUP_COUNT = setting("log_backup_count")
        USE_RE2 = setting("use_re2")
        DEEP_VALIDATION = setting("deep_validate")
        PHONE_ENTITY_LENIENCY = setting("phone_entity_leniency")
        EXTRACT_ALL_DATE_FORMATS = setting("extract_all_date_formats")
        NO_DOCS_TO_PROCESS = setting("docs_per_batch")
        PRE_PROCESS_TEXT = setting("pre_process_text")
        LOW_PERF = setting("low_perf")
        USER_DEFINED_PROXIMITY_CONF = ujson.loads(setting("proximity_conf"))

    # Logger set-up; a failure here is only printed.
    try:
        LOGGER_Client = get_logger_handler(
            os.path.join(getBaseDir(), HELPER_DLL),
            "ContentAnalyzer",
            {
                "ROTATING_BACKUP_COUNT": ROTATING_BACKUP_COUNT,
                "ROTATING_MAX_BYTES": ROTATING_MAX_BYTES,
            },
        )
    except Exception as e:
        print(f"Error while initialising: {e}")

    checkParentAndKill(parent_pid, PROCESS_ID)
    USE_RE2 = checkRe2(USE_RE2)

    ENTITIES_KEYS = entities_attributes["entities_keys"]
    ENTITIES_REGEX = entities_attributes["entities_regex"]
    ENTITIES_KEYWORDS = entities_attributes["entities_keywords"]

    try:
        client_config = getConfiguration()
        client_config["perfCounter"] = PerformanceCounter(LOGGER_Client)
        client_config["perfCounter"].periodicLogTimer()

        # Handler and client reference each other through callbacks.
        handler = EEHandler(client_config)
        client = EEClient(client_config, handler.processMessage)
        handler.setUpdateFunction(client.updateStatus)

        command_thread = CommandManager(commandQ, client.stop, handler.stop)
        try:
            command_thread.start()
            client.run()
        except KeyboardInterrupt:
            client.stop()
    except Exception as e:
        LOGGER_Client.exception(f"There was an error while initializing clients: {e}", func_str)


# Script entry point: start the client with main_client's default arguments.
# NOTE(review): the default entities_attributes is an empty dict, yet
# main_client indexes entities_attributes["entities_keys"] - confirm that
# running this file directly is still expected to work.
if __name__ == "__main__":
    main_client()
