import json

def getcharoffsetsfromwordoffsets(doc, entities):
    """Translate entity spans given as ``doc`` slice indices into character offsets.

    Each item of ``entities`` is indexed as ``(start, end, label)``. ``doc[start:end]``
    is sliced and the ``start_char`` / ``end_char`` attributes of the resulting span
    are paired with the unchanged label.

    Returns a list of ``(start_char, end_char, label)`` tuples, in input order.
    """

    def as_char_tuple(ent):
        piece = doc[ent[0] : ent[1]]
        return (piece.start_char, piece.end_char, ent[2])

    return [as_char_tuple(ent) for ent in entities]


def convertspacyapitocliformat(nlp, TRAIN_DATA):
    """Convert API-style NER training examples into a JSON-serialisable document list.

    Args:
        nlp: object providing ``make_doc(text)``; it is used only to tokenise each text.
        TRAIN_DATA: iterable of examples indexed as ``(text, annotations)``, where
            ``annotations["entities"]`` is a list of ``(start, end, label)`` spans
            that are passed to ``getcharoffsetsfromwordoffsets`` and therefore used
            as slice indices into the tokenised doc.

    Returns:
        A list with one dict per example, numbered from 1::

            {"id": n,
             "paragraphs": [{"raw": <doc text>,
                             "sentences": [{"tokens": [<token dict>, ...]}]}]}

        Every example becomes a single paragraph holding a single sentence. Each token
        dict carries placeholder ``head`` / ``dep`` / ``tag`` values, the token text in
        ``orth``, its BILUO tag in ``ner`` and its position in ``id``.
    """
    # spaCy v3 moved spacy.gold to spacy.training and renamed the helper; accept
    # either so the conversion keeps working across major versions.
    try:
        from spacy.gold import biluo_tags_from_offsets
    except ImportError:
        from spacy.training import offsets_to_biluo_tags as biluo_tags_from_offsets

    documents = []
    for docnum, example in enumerate(TRAIN_DATA, start=1):
        doc = nlp.make_doc(example[0])
        charoffsets = getcharoffsetsfromwordoffsets(doc, example[1]["entities"])
        tags = biluo_tags_from_offsets(doc, charoffsets)
        tokens = [
            {
                "head": 0,
                "dep": "",
                "tag": "",
                # text_with_ws is the non-deprecated spelling of Token.string and
                # yields the same value (token text plus any trailing whitespace).
                "orth": token.text_with_ws,
                "ner": tag,
                "id": index,
            }
            for index, (token, tag) in enumerate(zip(doc, tags))
        ]
        paragraph = {"raw": doc.text, "sentences": [{"tokens": tokens}]}
        documents.append({"id": docnum, "paragraphs": [paragraph]})
    return documents


def createtraintestdevsets(nlp, trainingexamples, filename, testsplit=20, devsplit=20):
    """Split the examples into test/training/dev sets and write each one as JSON.

    The examples are first split into test and non-test portions; the non-test
    portion is then split again into training and dev. Both splits use a fixed
    ``random_state`` of 42, so the partition is reproducible for a given input.
    Each portion is converted with ``convertspacyapitocliformat`` and dumped to
    ``<filename>_test.json``, ``<filename>_training.json`` and ``<filename>_dev.json``.

    Args:
        nlp: passed through to ``convertspacyapitocliformat``.
        trainingexamples: indexable, sized collection of training examples.
        filename: path prefix for the three output files.
        testsplit: percentage of all examples held out for the test set.
        devsplit: percentage of the remaining (non-test) examples used for the dev set.

    Returns:
        The path of the written test-set file, or ``None`` (nothing written) when
        ``trainingexamples`` is empty.
    """
    if len(trainingexamples) == 0:
        return None

    # sklearn.cross_validation was removed in scikit-learn 0.20; prefer its
    # replacement and fall back only for very old installations. Importing after
    # the guard above means empty input does not require scikit-learn at all.
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        from sklearn.cross_validation import train_test_split

    def write_split(examples, suffix):
        # Convert before opening so a conversion failure leaves no empty file behind.
        converted = convertspacyapitocliformat(nlp, examples)
        path = filename + suffix
        with open(path, "w") as fp:
            json.dump(converted, fp)
        return path

    trainidx, testidx = train_test_split(
        list(range(len(trainingexamples))), test_size=float(testsplit) / 100, random_state=42
    )
    testingsetpath = write_split([trainingexamples[i] for i in testidx], "_test.json")

    # Second split operates on positions within the non-test portion only.
    trainingset = [trainingexamples[i] for i in trainidx]
    trainidx, devidx = train_test_split(
        list(range(len(trainingset))), test_size=float(devsplit) / 100, random_state=42
    )
    write_split([trainingset[i] for i in trainidx], "_training.json")
    write_split([trainingset[i] for i in devidx], "_dev.json")

    return testingsetpath


def util_get_dict_attr(obj, param, defaultValue=None):
    """ return obj[param] when obj is not None and contains param,
    otherwise return defaultValue
    """
    # Guard clause: a missing container or a missing key both yield the default.
    if obj is None or param not in obj:
        return defaultValue
    return obj[param]
