import re, sys, ujson, traceback
from collections import defaultdict
from CvEEConfigHelper import (
    to_bytes,
    to_string,
    clean,
    removeLineBreaks,
    removeMultipleSpaces,
    removeHiddenText,
)
import base64

def findSpans(text, matches):
    """Re-locate every matched string inside ``text`` and record its spans.

    ``matches`` maps each matched string to a list with one entry per
    occurrence that was found; only the length of that list is used here.
    Each value is replaced, in place, by a list of ``(start, end)`` tuples
    (``end`` exclusive) giving where the string occurs in ``text``, scanning
    left to right.

    Occurrences that cannot be found in ``text`` are dropped rather than
    being recorded as a bogus ``(-1, ...)`` span, so a key may end up with
    fewer spans than it started with (possibly none).

    Args:
        text: The string to search in.
        matches: Mapping of matched string -> list of occurrences.

    Returns:
        The same ``matches`` mapping, mutated so that each value is the list
        of spans located in ``text``.
    """
    for match, occurrences in list(matches.items()):
        spans = []
        search_from = 0
        for _ in range(len(occurrences)):
            start = text.find(match, search_from)
            if start == -1:
                # Nothing further to find; later searches would fail as well.
                break
            end = start + len(match)
            spans.append((start, end))
            # Resume right after this occurrence so a directly adjacent one
            # is not skipped. An empty match must still advance by one
            # character, otherwise the same position would be found again.
            search_from = end if end > start else end + 1
        matches[match] = spans
    return matches

def getMatches(regex, test_string):
    """Collect every match of ``regex`` in ``test_string`` grouped by text.

    When the pattern defines at least one capturing group, the text and span
    of group 1 are recorded; otherwise the whole match (group 0) is used.
    If group 1 exists but did not participate in a particular match, the
    whole match is used for that occurrence instead, so a ``None`` key with a
    ``(-1, -1)`` span is never produced.

    Args:
        regex: A compiled regular expression pattern.
        test_string: The string to scan.

    Returns:
        A ``defaultdict(list)`` mapping each matched string to the list of
        ``(start, end)`` spans (``end`` exclusive) where it was found.
    """
    matches = defaultdict(list)
    # Whether group 1 exists is a property of the pattern, so decide it once
    # instead of probing with try/except on every match.
    has_group = regex.groups >= 1
    for match in regex.finditer(test_string):
        use_group = has_group and match.group(1) is not None
        group_idx = 1 if use_group else 0
        matches[match.group(group_idx)].append(match.span(group_idx))
    return matches


def getTaggedString(test_string, matches, custom_open_tag, custom_close_tag):
    """Return ``test_string`` with every matched span wrapped in tags.

    The open tag is inserted before the first character of each span and the
    close tag after its last character. Spans are ``(start, end)`` with
    ``end`` exclusive.

    Spans that are empty or fall outside ``test_string`` are skipped: for
    them ``end - 1`` would be negative or out of range, which would either
    raise ``IndexError`` or attach a tag to an unrelated character at the
    end of the string.

    Args:
        test_string: The text to annotate.
        matches: Mapping of matched string -> list of ``(start, end)`` spans.
        custom_open_tag: Text inserted before each span.
        custom_close_tag: Text inserted after each span.

    Returns:
        The annotated string.
    """
    # Work on a list of single characters so tags can be attached to a
    # character without shifting the indices of the characters after it.
    chars = list(test_string)
    text_len = len(chars)
    for positions in matches.values():
        for position in positions:
            start, end = position[0], position[1]
            if not 0 <= start < end <= text_len:
                continue
            chars[start] = custom_open_tag + chars[start]
            chars[end - 1] += custom_close_tag
    return "".join(chars)


def getFormattedObject(matches, tagged_string):
    """Build the result dictionary from the matches and the tagged text.

    Args:
        matches: Mapping of matched string -> list of ``(start, end)`` spans
            with ``end`` exclusive.
        tagged_string: The input text with the tags already inserted.

    Returns:
        A dict with ``"HighlightedText"`` set to ``tagged_string`` and
        ``"Matches"`` holding one entry per matched string. Each entry has
        the matched string under ``"Entity"`` and, under ``"Position"``, a
        list of ``startIndex`` / ``endIndex`` pairs where ``endIndex`` is
        inclusive (``end - 1``).
    """
    entities = [
        {
            "Entity": entity,
            "Position": [
                {"startIndex": span[0], "endIndex": span[1] - 1}
                for span in spans
            ],
        }
        for entity, spans in matches.items()
    ]
    return {"HighlightedText": tagged_string, "Matches": entities}


if __name__ == "__main__":
    # Command-line entry point. Accepted argument forms:
    #   <regex> <text>
    #   <regex> <text> <open_tag> <close_tag>
    #   <regex> <text> <is_base64> [<open_tag> <close_tag>]
    # When <is_base64> is "true" (case-insensitive), <regex> and <text> are
    # base64-decoded before use. The result, or an error description, is
    # printed to stdout as a single JSON object.
    try:
        cl_args = sys.argv

        custom_open_tag = "<mark>"
        custom_close_tag = "</mark>"

        if len(cl_args) >= 3:
            # Start from the plain values so both names are always bound,
            # even when the flag argument is empty or unrecognised.
            regex_string = cl_args[1]
            test_string = cl_args[2]
            extra_args = cl_args[3:]

            if len(extra_args) == 2 and extra_args[0].lower() not in ("true", "false", ""):
                # Documented form without a flag: the two extra arguments
                # are the open and close tags.
                custom_open_tag, custom_close_tag = extra_args
            elif extra_args:
                # First extra argument is the base64 flag; any value other
                # than "true" means the inputs are used as given.
                if extra_args[0].lower() == "true":
                    regex_string = to_string(base64.decodebytes(to_bytes(cl_args[1])))
                    test_string = to_string(base64.decodebytes(to_bytes(cl_args[2])))
                if len(extra_args) >= 3:
                    custom_open_tag = extra_args[1]
                    custom_close_tag = extra_args[2]

            regex = re.compile(regex_string, re.DOTALL | re.IGNORECASE | re.MULTILINE)

            # pre process test_string similar to what we do for ee to get the consistent behavior
            clean_input = clean(
                test_string,
                removeLineBreaks,
                # removeTabs, # use case is covered in below method
                removeMultipleSpaces,
                removeHiddenText,
            )
            # The regex runs on the cleaned text; the matched strings are
            # then located again in the raw input so that the reported
            # positions and the inserted tags refer to the raw input.
            matches = getMatches(regex, clean_input)
            matches = findSpans(test_string, matches)
            tagged_string = getTaggedString(test_string, matches, custom_open_tag, custom_close_tag)

            result_object = getFormattedObject(matches, tagged_string)

            print(ujson.dumps(result_object))

        else:
            print(
                "Incorrect Usage\n"
                "Try python testRegex.py <regex_string> <test_string>\n"
                "eg. $ python testRegex.py (?:abc) abcdef\n"
                '$ python testRegex.py (?:abc) abcdef "<b>" "</b>"'
            )

    except Exception as e:
        # Top-level boundary: report any failure (bad pattern, bad base64,
        # ...) as JSON on stdout instead of a traceback.
        error_object = {
            "ErrorMessage": "Failed to validate regular expression. Reason: {} ".format(e)
        }
        print(ujson.dumps(error_object))
