import os
import pickle
import re
import string
from operator import add
from pathlib import Path

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

import spacy
import ujson
from CvEEConfigHelper import (
    DOCUMENT_CLASSIFY_MODELS_MAP,
    DOCUMENT_CLASSIFY_MODELS_REG_KEY,
    getPythonSitePackagesDir,
    loadRegValue,
)
from CvEEUtil import util_get_dict_attr

# Category labels in the fixed order used by every probability vector in this
# module: position i of a model's probability list is read as the score for
# DOCUMENT_CATEGORIES[i], so reordering this list changes how results are decoded.
DOCUMENT_CATEGORIES = ["resumes", "logs", "source_codes", "others"]


class ContentBasedCategorizer:
    """Scores a document's text with a spaCy text-categorizer model."""

    def __init__(self, model_path, nlp):
        """
        Load the spaCy text categorizer model (trained to classify logs and
        resumes) from ``model_path``, sharing the vocabulary of ``nlp``.

        A load failure is only printed; ``self.classify`` is then left unset.
        """
        try:
            self.classify = spacy.load(model_path, vocab=nlp.vocab)
        except Exception as err:
            print("Failed to load spacy text categorizer model. Exception {}".format(err))

    def get_max_prob_cat(self, cats):
        """
        Turn a ``{label: probability}`` mapping into a list ordered like
        DOCUMENT_CATEGORIES. Labels absent from ``cats`` keep a score of 0.0;
        a label that is not in DOCUMENT_CATEGORIES raises ValueError.
        """
        ordered_scores = [0.0 for _ in DOCUMENT_CATEGORIES]
        for label, score in cats.items():
            slot = DOCUMENT_CATEGORIES.index(label)
            ordered_scores[slot] = score
        return ordered_scores

    def get_document_type(self, content):
        """
        Return the per-category probabilities predicted for ``content``.

        Only the first 10000 space-separated tokens of the stripped text are
        passed to the model.
        """
        tokens = content.strip().split(" ")
        truncated = " ".join(tokens[:10000])
        scored_doc = self.classify(truncated)
        return self.get_max_prob_cat(scored_doc.cats)


class StructureBasedCategorizer:
    """Scores a document from layout statistics (whitespace, punctuation) of its text."""

    def __init__(self, model_path):
        """
        Load the pickled scikit-learn Random Forest Classifier model, which is
        trained to classify logs, resumes and source_codes based on structure.

        A load failure is only printed; ``self.classify`` is then left unset,
        so a later ``get_document_type`` call raises AttributeError.
        """
        try:
            # SECURITY: unpickling executes arbitrary code embedded in the file;
            # model_path must only ever point at a trusted model artifact.
            # The context manager closes the file handle even if loading fails.
            with open(Path(model_path), "rb") as model_file:
                self.classify = pickle.load(model_file, encoding="latin-1")
        except Exception as e:
            print("Failed to load structured based text categorizer model. Exception {}".format(e))

    def get_features(self, text):
        """
        Compute the structural feature dict for ``text``.

        Character-level ratios are divided by the stripped text length; the
        per-line averages are divided by the newline count (1 when there are
        no newlines). Key insertion order is preserved because it becomes the
        column order of the DataFrame handed to the model.

        Empty or whitespace-only text no longer raises ZeroDivisionError: the
        length divisor falls back to 1 so a feature dict is still returned.
        """
        content = text.strip()
        total_chars = float(len(content) or 1)
        newline_count = content.count("\n")
        # re.split always returns at least one element, so this is >= 1.
        word_count = len(re.split(r"\W+", content))

        features = {
            "newlines": newline_count / total_chars,
            "words": word_count / total_chars,
            "spaces": len(re.findall(r"\s", content)) / total_chars,
            "tabs": content.count("\t") / total_chars,
        }

        line_divisor = float(newline_count or 1)
        features["words_per_line"] = word_count / line_divisor
        # These two divide the already-normalised ratios, not the raw counts.
        features["average_spaces"] = features["spaces"] / line_divisor
        features["average_tabs"] = features["tabs"] / line_divisor
        # NOTE(review): this counts occurrences of the complete
        # string.punctuation sequence as one literal, so it is 0 for almost
        # any real text. Left unchanged on purpose: the pickled model was
        # presumably trained on features computed this way - confirm against
        # the training code before "fixing" it.
        punctuation_count = content.count(string.punctuation)
        features["average_punctuation"] = punctuation_count / line_divisor
        for punct in string.punctuation:
            features["punct_" + punct] = content.count(punct) / line_divisor
        return features

    def merge_dict(self, train_dict=None, feature_dict=None):
        """
        Append every value of ``feature_dict`` to the list stored under the
        same key in ``train_dict`` (created on first use). ``train_dict`` is
        modified in place; nothing happens when ``feature_dict`` is None.
        """
        if feature_dict is None:
            return
        for key, value in feature_dict.items():
            if key not in train_dict:
                train_dict[key] = []
            train_dict[key].append(value)

    def get_max_prob_cat(self, probs):
        """Return the first row of ``probs`` (the single document) as a plain list."""
        return probs.tolist()[0]

    def get_document_type(self, content):
        """
        Return the model's class probabilities for ``content`` as a list.

        NOTE(review): callers index this list with DOCUMENT_CATEGORIES, which
        assumes the model's class order matches that list - confirm.
        """
        single_row = {}
        self.merge_dict(single_row, self.get_features(content))
        test_df = pd.DataFrame.from_dict(single_row)
        probs = self.classify.predict_proba(test_df)
        return self.get_max_prob_cat(probs)


class Categorizer:
    """Combines several categorizer models by averaging their probabilities."""

    def __init__(self, models, nlp):
        """
        Build one categorizer per entry of ``models``.

        models = [{
            'model_type' : 'content_based',
            'model_library' : 'spacy',
            'model_path' : ''
        },{
            'model_type' : 'structure_based',
            'model_library' : 'sklearn',
            'model_path' : ''
        }]

        ``model_path`` is resolved relative to the directory returned by
        getPythonSitePackagesDir(). Entries with any other ``model_type`` are
        ignored. Categorizers are stored by ``model_type``, so a later entry
        of the same type replaces an earlier one. ``nlp`` is only passed on to
        the content based categorizer.
        """
        site_packages_dir = getPythonSitePackagesDir()
        self.models = {}
        for model in models:
            model_type = model["model_type"]
            if model_type == "content_based":
                self.models[model_type] = ContentBasedCategorizer(
                    os.path.join(site_packages_dir, Path(model["model_path"])), nlp
                )
            elif model_type == "structure_based":
                self.models[model_type] = StructureBasedCategorizer(
                    os.path.join(site_packages_dir, Path(model["model_path"]))
                )

    def get_document_type(self, text):
        """
        Get the predicted document category for ``text``.

        For now this just averages the per-category probabilities of all the
        models. Returns "others" when the best averaged probability is below
        0.5, and also when no models are registered (previously that case
        raised ZeroDivisionError).
        """
        if not self.models:
            return "others"

        totals = [0.0] * len(DOCUMENT_CATEGORIES)
        for model in self.models.values():
            totals = list(map(add, totals, model.get_document_type(text)))

        num_models = len(self.models)
        averaged = [total / num_models for total in totals]
        best_prob = max(averaged)
        if best_prob < 0.5:
            return "others"
        return DOCUMENT_CATEGORIES[averaged.index(best_prob)]


class DocumentCategorizer(object):
    """Entry point that classifies document text when the classifier is enabled."""

    def __init__(self, nlp, params=None):
        """
        Set up the categorizer from ``params``.

        ``params`` must contain "logger" (KeyError otherwise). "perfCounter"
        is read through util_get_dict_attr. The classifier is only built when
        ``params["use_document_classifier"]`` equals True; its model list is
        the JSON value read via loadRegValue.

        If parsing that JSON or building the Categorizer fails, the error is
        logged and ``self.classifier`` stays None, so ``classify`` falls back
        to "others" instead of raising AttributeError.
        """
        if params is None:
            params = {}
        self.LOGGER_Generic = params["logger"]
        self.perfCounter = util_get_dict_attr(params, "perfCounter")
        # Always defined so classify() can test it safely.
        self.classifier = None
        # Equality rather than truthiness is kept from the original check:
        # True and 1 enable the classifier, other truthy values do not.
        self.use_document_classifier = bool(
            params.get("use_document_classifier", False) == True  # noqa: E712
        )
        # use document classifier if use_document_classifier is set as True
        if self.use_document_classifier:
            DOCUMENT_CLASSIFY_MODELS_DIR = loadRegValue(
                DOCUMENT_CLASSIFY_MODELS_REG_KEY, DOCUMENT_CLASSIFY_MODELS_MAP, type=str
            )
            try:
                models = ujson.loads(DOCUMENT_CLASSIFY_MODELS_DIR)
                self.classifier = Categorizer(models, nlp)
            except Exception as e:
                self.LOGGER_Generic.error(
                    "Failed to parse JSON from sEEDocumentClassifyModelsInstallDir registry key. Exception : {}".format(
                        e
                    )
                )

    def classify(self, content):
        """
        Return the predicted category for ``content``.

        Returns "others" when the classifier is disabled or was never loaded.
        """
        if not self.use_document_classifier or self.classifier is None:
            return "others"
        return self.classifier.get_document_type(content)


if __name__ == "__main__":
    # Sample model configuration: one dict per model, using the same keys that
    # Categorizer reads ("model_type" and "model_path").
    models = [
        {
            "model_type": "content_based",
            "model_library": "spacy",
            "model_path": "document_classify_v1.1",
        },
        {
            "model_type": "structure_based",
            "model_library": "sklearn",
            "model_path": "structure_classify_v1.1\\structure_model.pkl",
        },
    ]
    # TODO: this will not work as we need to load the large model first
    # (Categorizer also takes an `nlp` argument, which the sketch below does
    # not pass). Will update the unit test code later.
    # categorizer = Categorizer(models)
    # text = u'Example Text'
    # with open('file_path') as f:
    # 	text = f.read().decode('utf-8')
    # print(categorizer.get_document_type(text))
