from collections import Counter
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics.pairwise import cosine_similarity
from scipy.cluster.hierarchy import ward, to_tree
from scipy.sparse import csr_matrix
from operator import itemgetter as _itemgetter
import re
import os
import heapq
from datetime import datetime
from simhash import Simhash
import uuid
from concurrent.futures import ThreadPoolExecutor
import inspect
from cvee_stopwords import stopwords
import nltk

# Stop words as a set, used for membership tests in get_features / remove_stopwords.
stopwords_set = set(stopwords)
# Shared tokenizer used by truncate_words / remove_stopwords.
wpt = nltk.WordPunctTokenizer()

# Divisor used by generateSimHash to size the temporary per-chunk files.
FILE_COUNT = 20
# max_workers of the thread pool that hashes the chunk files.
POOL_SIZE = 20
# Maximum depth of the cluster level path.
MAX_LEVEL = 6
# Document lists longer than this are split with k-means, others hierarchically.
MAX_HIERARCHAL_SIZE = 5000
# Clusters with at most this many documents are not split any further.
MIN_CLUSTER_SIZE = 2
# n_clusters of the shared k-means estimator below.
MAX_KMEAN_COUNT = 8
# Largest number of differing simhash bits still treated as "similar".
MAX_SIM_GAP = 10
# Not referenced in the visible part of this file.
MAX_TFIDF_DIST = 2
# Emails with at most this many body features are reported as "short email".
SHORT_EMAIL_LEN = 3
# How many tree levels getChildren descends to collect sub-clusters.
CHILD_LEVEL = 4
# Number of boolean columns per row of the simhash matrix.
COLS = 64
# Not referenced in the visible part of this file.
OUTPUT_HASH = False

# Module-level map from simhash value to leaf node; shared by all EmailCluster instances.
GlobalHashNode = {}

# Module-level k-means estimator; kmeansCluster calls fit_predict on it for every split.
clf = MiniBatchKMeans(n_clusters=MAX_KMEAN_COUNT, random_state=7)

class EmailCluster:
    
    def __init__(self, dataFile, csvToken, resFile, logger = None, mqClient = None, respQName = None):
        """Load emails from ``dataFile``, cluster them and write the result to ``resFile``.

        Each line of ``dataFile`` is split on ``csvToken`` into content id,
        subject and body (fields 0, 1 and 2); subject and body are
        right-stripped and lower-cased.  Emails whose cleaned body yields more
        than ``SHORT_EMAIL_LEN`` features are clustered; the others are
        reported under the "short email" label.

        Args:
            dataFile: path of the input file, one email per line.
            csvToken: separator string between the fields of a line.
            resFile: path of the result file, opened for writing.
            logger: optional object with an ``info`` method; without it
                messages are printed.
            mqClient: optional object with a ``send(queue, payload)`` method.
            respQName: queue name handed to ``mqClient.send``.

        Raises:
            IndexError: if a line of ``dataFile`` has fewer than three fields.
        """
        self.topFeatures = []
        self.featureDocs = {}
        self.nogroupEmails = []
        self.simHash = []

        self.hashNode = {}
        self.removedNode = set()

        self.mqClient = mqClient
        self.respQName = respQName
        self.logger = logger

        self.emails = []
        self.subjects = []
        self.contentids = []

        self.log("start")

        with open(dataFile) as email_file:
            for line in email_file:
                email = line.split(csvToken)
                self.contentids.append(email[0])
                self.subjects.append(email[1].rstrip().lower())
                self.emails.append(email[2].rstrip().lower())

        self.dataFolder = os.path.dirname(dataFile)

        self.log("list length " + str(len(self.emails)) + " before clean")

        self.processed = [False] * len(self.emails)

        normalEmails = []
        self.shortEmails = []

        for index, email in enumerate(self.emails):
            # Bodies were already lower-cased while loading the file.
            emailFeature = get_features(get_clean_text(email))
            if len(emailFeature) > SHORT_EMAIL_LEN:
                normalEmails.append(index)
            else:
                self.shortEmails.append(index)

            # Per-email subject keywords, weighted by their relative frequency.
            subjectFeature = get_features(self.subjects[index])
            features = Counter(subjectFeature).most_common(10)
            for word, _count in features:
                self.featureDocs[word] = self.featureDocs.get(word, 0) + 1
            subjectLen = float(len(subjectFeature))
            self.topFeatures.append(
                [(word, float(count) / subjectLen) for word, count in features])

        self.log("list length " + str(len(normalEmails)) + " after clean")
        rows = len(self.emails)

        # One row of COLS booleans per email, filled in by generateSimHash.
        self.arr = [([False] * COLS) for i in range(rows)]
        self.generateSimHash()

        self.log("hash matrix finish")

        self.hash_matrix = csr_matrix(self.arr)

        self.result = []
        self.doCluster([], normalEmails, self.result)

        self.json_res = None
        # The context manager guarantees the result file is flushed and closed,
        # even if writing the output fails; self.text_res stays set (closed).
        with open(resFile, 'w') as resHandle:
            self.text_res = resHandle
            self.outputClusterResult()

        # Every email should have been emitted by exactly one output section.
        for i, done in enumerate(self.processed):
            if not done:
                print("missing line " + str(i))
            
    
    def log(self, msg):
        """Report ``msg`` through the configured logger, or print it.

        With a logger the message is prefixed with the calling module name and
        function name; without one it is printed after the current timestamp.
        """
        if not self.logger:
            print(str(datetime.now()) + msg)
            return

        # Index 1 is the frame of whoever called log().
        callerFrame = inspect.stack()[1]
        callerModule = inspect.getmodule(callerFrame[0])
        callerName = callerFrame.function
        self.logger.info(callerModule.__name__ + ":" + callerName + " " + msg)
    
    def outputSubTree(self, levels, subTree):
        """Render one entry of the cluster result as ``(text, json)`` strings.

        ``subTree`` is one of:
          * a ``Tree`` - delegated to ``generateTreeDataOutput``;
          * a ``(docs, subClusters)`` tuple - rendered as a labelled group whose
            sub-clusters are rendered recursively;
          * anything else - treated as a list of document indexes and rendered
            as a leaf cluster.

        ``levels`` is the current level path; it is extended while recursing
        and restored afterwards.
        """
        if isinstance(subTree, Tree):
            treeRes = self.generateTreeDataOutput(subTree, levels)
            return (treeRes[0], treeRes[1])

        indent = '  ' * len(levels)
        isGroup = isinstance(subTree, tuple)
        members = subTree[0] if isGroup else subTree

        clusterId = getLevelsStr(levels)
        keyWords = self.getClusterKeyWords(members)
        self.sentMQClusterResp(levels, keyWords)

        textParts = [
            "  <indexItems>\n",
            "    <customProperties>\n",
            "      <nameValues name=\"CLUSTERID\" value=\"" + clusterId + "\"/>\n",
            "      <nameValues name=\"CLUSTERLABEL\" value=\"" + keyWords + "\"/>\n",
            "    </customProperties>\n",
            "  </indexItems>\n",
        ]
        jsonParts = [
            indent + '{' + '\n',
            indent + '  "name":"' + keyWords + '",\n',
        ]

        if isGroup:
            subClusters = subTree[1]
            lastPos = len(subClusters) - 1
            jsonParts.append(indent + '  "children":[' + '\n')
            for pos, child in enumerate(subClusters):
                levels.append(pos + 1)
                childText, childJson = self.outputSubTree(levels, child)
                levels.pop()
                textParts.append(childText)
                jsonParts.append(childJson)
                if pos != lastPos:
                    # Separator between sibling JSON objects.
                    jsonParts.append(indent + '  ,\n')
            jsonParts.append(indent + '  ]\n')
        else:
            leafText, leafJson = self.outputOneLeafCluster(levels, indent, subTree)
            textParts.append(leafText)
            jsonParts.append(leafJson)

        jsonParts.append(indent + '}' + '\n')
        return (''.join(textParts), ''.join(jsonParts))

    def outputClusterResult(self):
        """Write the complete cluster result.

        The text form goes to ``self.text_res``; the JSON-like form is written
        to ``self.json_res`` only when that attribute is set.  After the
        clusters in ``self.result`` come the "short email" part (id -1) and the
        "no-group email" part (id -2).
        """
        textParts = ["<App_CVCentralIndexItemReq opType=\"\">\n"]
        jsonParts = ['{\n', '"name": "root",\n', '"children": [\n']

        levels = []
        lastPos = len(self.result) - 1
        for pos, cluster in enumerate(self.result):
            levels.append(pos + 1)
            clusterText, clusterJson = self.outputSubTree(levels, cluster)
            textParts.append(clusterText)
            jsonParts.append(clusterJson)
            if pos != lastPos:
                jsonParts.append('  ,')
            levels.pop()

        shortText, shortJson = self.generateNoClusterEmailPart(self.shortEmails, 'short email', -1)
        textParts.append(shortText)
        jsonParts.append(shortJson)

        looseText, looseJson = self.generateNoClusterEmailPart(self.nogroupEmails, 'no-group email', -2)
        textParts.append(looseText)
        jsonParts.append(looseJson)

        textParts.append("</App_CVCentralIndexItemReq>\n")
        jsonParts.append(']\n')
        jsonParts.append('}\n')

        self.text_res.write(''.join(textParts))
        if self.json_res:
            self.json_res.write(''.join(jsonParts))
    
    def getTopFeatures(self, docList, n = 200):
        """Return the most representative subject words for ``docList``.

        If every document has the same subject feature sequence, the first five
        features of that subject are returned.  Otherwise the per-document
        subject weights in ``self.topFeatures`` are summed per word and the
        ``n`` heaviest words are returned, heaviest first.
        """
        # Stop scanning as soon as a second distinct subject shows up.
        distinctSubjects = set()
        for doc in docList:
            distinctSubjects.add(tuple(get_features(self.subjects[doc])))
            if len(distinctSubjects) > 1:
                break

        if len(distinctSubjects) == 1:
            (onlySubject,) = distinctSubjects
            return list(onlySubject)[:5]

        weights = {}
        for doc in docList:
            for word, weight in self.topFeatures[doc]:
                weights[word] = weights.get(word, 0) + weight

        ranked = heapq.nlargest(n, weights.items(), key=_itemgetter(1))
        return [word for word, _ in ranked]
    
    def getClusterKeyWords(self, data):
        """Build a cluster label from the top features of the documents in ``data``.

        Up to five features of at most 20 characters are joined with spaces.
        If no feature qualifies, the first feature is used as the label; with
        no features at all the label is empty.
        """
        features = self.getTopFeatures(data)

        picked = []
        for feature in features:
            if len(picked) == 5:
                break
            if len(feature) <= 20:
                picked.append(feature)

        if picked:
            keyWords = ''.join(word + ' ' for word in picked)
        elif len(features) > 0:
            # Everything was too long: fall back to the best feature as-is.
            keyWords = features[0].strip()
        else:
            keyWords = ''

        return keyWords.strip()
    
    def getGroupHashDif(self, node):
        """Return how different the two children of ``node`` are.

        The first document of each child is compared.  If the two email texts
        share at least three leading or at least three trailing
        space-separated words the result is 0; otherwise it is the number of
        differing bits between the two simhash values.  A node with a missing
        child also yields 0.
        """
        leftNode = node.left
        rightNode = node.right

        if leftNode is None or rightNode is None:
            return 0

        leftDoc = leftNode.data[0]
        rightDoc = rightNode.data[0]

        leftWords = self.emails[leftDoc].split(' ')
        rightWords = self.emails[rightDoc].split(' ')

        samePrefix = 0
        for leftWord, rightWord in zip(leftWords, rightWords):
            if leftWord != rightWord:
                break
            samePrefix += 1
        if samePrefix >= 3:
            return 0

        # Compare at most the last three words, walking backwards.  zip() stops
        # at the shorter text, so a short email can never wrap around and be
        # compared against the same word twice.
        sameSuffix = 0
        for leftWord, rightWord in zip(reversed(leftWords[-3:]), reversed(rightWords[-3:])):
            if leftWord != rightWord:
                break
            sameSuffix += 1
        if sameSuffix >= 3:
            return 0

        return bin(self.simHash[leftDoc] ^ self.simHash[rightDoc]).count("1")

    def getSimilarNode(self, doc):
        """Look up a registered node whose simhash is close to that of ``doc``.

        Returns the node registered under the exact hash in ``self.hashNode``,
        else the node with the nearest hash when it differs in at most
        ``MAX_SIM_GAP`` bits, else ``None``.  Matches that exist only in
        ``GlobalHashNode`` are merely reported with a print.
        NOTE(review): no live caller in the visible part of this file.
        """
        docHash = self.simHash[doc]
        exactNode = self.hashNode.get(docHash)
        if exactNode:
            return exactNode

        def nearest(candidates):
            # Smallest bit distance to docHash and the hash that produced it.
            bestGap, bestKey = 64, 0
            for candidate in candidates:
                gap = bin(docHash ^ candidate).count("1")
                if gap < bestGap:
                    bestGap, bestKey = gap, candidate
            return bestGap, bestKey

        foundInGlobal = GlobalHashNode.get(docHash) is not None

        localGap, localKey = nearest(self.hashNode.keys())
        if localGap <= MAX_SIM_GAP:
            return self.hashNode[localKey]

        if not foundInGlobal:
            globalGap, _ = nearest(GlobalHashNode.keys())
            if globalGap <= MAX_SIM_GAP:
                foundInGlobal = True

        if foundInGlobal:
            print('findInGlobal')

        return None
    
    def removeOneNode(self, node):
        """Unlink ``node`` from its parent and remember it in ``self.removedNode``.

        The parent's slot is taken over by the node's left child if it has
        one, otherwise by its right child, otherwise it is cleared.  A node
        without a parent is left untouched and is not recorded.
        """
        parent = node.parent
        if not parent:
            return

        replacement = node.left or node.right or None

        if parent.left == node:
            parent.left = replacement
        else:
            parent.right = replacement

        self.removedNode.add(node)
    
    def processDiffSubNode(self, node):
        """Finalise a leaf child of a node whose children were not merged.

        Only leaf nodes are handled.  A leaf holding a single document is
        moved to ``self.nogroupEmails`` and unlinked from the tree; any other
        leaf is registered under the simhash of its first document in both
        ``self.hashNode`` and ``GlobalHashNode``.
        """
        isLeaf = node is not None and node.left is None and node.right is None
        if not isLeaf:
            return

        if len(node.data) == 1:
            self.nogroupEmails.extend(node.data)
            self.removeOneNode(node)
            return

        # Merging into an already registered similar node (via getSimilarNode)
        # used to be sketched here but was disabled.
        node.midHash = self.simHash[node.data[0]]
        self.hashNode[node.midHash] = node
        GlobalHashNode[node.midHash] = node
        
    def fillTreeData(self, tree):
        """Propagate document lists from the leaves up to the root of ``tree``.

        Nodes are handled children-first.  Each node absorbs the documents of
        its children; nodes ending up with at most one document are moved to
        ``self.nogroupEmails`` and unlinked; nodes with at most
        ``MIN_CLUSTER_SIZE`` documents become leaves; every leaf is registered
        by the simhash of its first document.
        """
        # Parents are collected before their descendants, so walking the list
        # backwards handles every child before its parent.
        pending = [tree]
        visitOrder = []
        while pending:
            current = pending.pop()
            if current.left:
                pending.append(current.left)
            if current.right:
                pending.append(current.right)
            visitOrder.append(current)

        for node in reversed(visitOrder):
            hasLeft = bool(node.left)
            hasRight = bool(node.right)

            if hasLeft and hasRight:
                node.data.extend(node.left.data)
                node.data.extend(node.right.data)
            elif hasLeft or hasRight:
                onlyChild = node.left if hasLeft else node.right
                if node.parent and onlyChild.left and onlyChild.right:
                    # A pass-through node above a real split: splice it out.
                    self.removeOneNode(node)
                    continue
                node.data.extend(onlyChild.data)
                if hasLeft:
                    node.left = None
                else:
                    node.right = None

            if len(node.data) <= 1:
                self.nogroupEmails.extend(node.data)
                self.removeOneNode(node)
                continue

            if len(node.data) <= MIN_CLUSTER_SIZE:
                node.right = None
                node.left = None

            if not node.left and not node.right:
                node.midHash = self.simHash[node.data[0]]
                self.hashNode[node.midHash] = node
                GlobalHashNode[node.midHash] = node
                
    def mergeSimilarCluster(self, tree):
        """Collapse sibling leaves of ``tree`` that are similar enough.

        Nodes are handled children-first.  When both children of a node are
        leaves and ``getGroupHashDif`` reports at most ``MAX_SIM_GAP``, their
        documents are pulled into the node and the children are dropped.
        Otherwise, for nodes that have a parent, each child is passed to
        ``processDiffSubNode``.
        """
        pending = [tree]
        visitOrder = []
        while pending:
            current = pending.pop()
            if current.left:
                pending.append(current.left)
            if current.right:
                pending.append(current.right)
            visitOrder.append(current)

        for node in reversed(visitOrder):
            left = node.left
            right = node.right

            if (left == None or right == None or node in self.removedNode
                    or len(left.data) == 0 or len(right.data) == 0):
                continue

            hashDiff = self.getGroupHashDif(node)
            bothLeaves = (left.left == None and left.right == None
                          and right.left == None and right.right == None)

            if hashDiff <= MAX_SIM_GAP and bothLeaves:
                node.data.extend(left.data)
                node.data.extend(right.data)
                node.left = None
                node.right = None
            elif node.parent:
                self.processDiffSubNode(left)
                self.processDiffSubNode(right)
                
    def generateNoClusterEmailPart(self, emilsList, label, sequence):
        """Render the documents that belong to no cluster as ``(text, json)``.

        ``label`` becomes the cluster label and ``sequence`` the cluster id.
        The JSON part starts with a separating comma line.
        """
        indent = '  '

        header = ''.join([
            "  <indexItems>\n",
            "    <customProperties>\n",
            "      <nameValues name=\"CLUSTERID\" value=\"" + str(sequence) + "\"/>\n",
            "      <nameValues name=\"CLUSTERLABEL\" value=\"" + label + "\"/>\n",
            "    </customProperties>\n",
            "  </indexItems>\n",
        ])

        leafText, leafJson = self.outputOneLeafCluster([sequence], indent, emilsList)

        jsonStr = ''.join([
            indent + ',' + '\n',
            indent + '{' + '\n',
            indent + '  "name":"' + label + '",' + '\n',
            leafJson,
            indent + '}' + '\n',
        ])

        return (header + leafText, jsonStr)
     
    def sentMQClusterResp(self, levels, keyWords):
        """Send a ``cluster <levels> <keyWords>`` message to the response queue.

        Does nothing when no message-queue client was configured.
        """
        if self.mqClient:
            payload = 'cluster ' + str(levels) + ' ' + keyWords
            self.mqClient.send(self.respQName, payload.encode())
     
    def generateTreeDataOutput(self, tree, levels):
        """Render a cluster ``tree`` as ``(text, json, keyWords)``.

        A node with fewer than two sub-clusters (as returned by
        ``getChildren``) is rendered as a leaf cluster holding all of its
        documents.  Otherwise each sub-cluster is rendered recursively and the
        node is labelled from the documents of all its sub-clusters.

        Args:
            tree: node to render, or ``None``.
            levels: current level path; extended while recursing and restored
                afterwards.

        Returns:
            A 3-tuple of strings.  For ``tree is None`` or a level path longer
            than ``MAX_LEVEL`` all three are empty, so callers can always
            index the result.
        """
        if tree is None or len(levels) > MAX_LEVEL:
            # Same shape as the regular return value; callers index [0], [1], [2].
            return ('', '', '')

        prefix = '  ' * len(levels)
        clusterId = getLevelsStr(levels)

        children = getChildren(tree, len(levels))
        length = len(children)

        subTxtParts = []
        subJsonParts = []

        if length < 2:
            keyWords = self.getClusterKeyWords(tree.data)
            self.sentMQClusterResp(levels, keyWords)
            leafRes = self.outputOneLeafCluster(levels, prefix, tree.data)
            subTxtParts.append(leafRes[0])
            subJsonParts.append(leafRes[1])
        else:
            subJsonParts.append(prefix + '  "children":[' + '\n')

            keyWordsList = []
            for i, child in enumerate(children):
                levels.append(i + 1)
                subRes = self.generateTreeDataOutput(child, levels)
                levels.pop()
                subTxtParts.append(subRes[0])
                subJsonParts.append(subRes[1])
                if i != length - 1:
                    # Separator between sibling JSON objects.
                    subJsonParts.append(prefix + '  ,\n')
                keyWordsList.append(subRes[2])

            keyWords = self.getKeyworksFromSubTrees(children, keyWordsList)
            self.sentMQClusterResp(levels, keyWords)

            subJsonParts.append(prefix + '  ]' + '\n')

        # The label is only known after the children were rendered, so the
        # header is assembled last and the children's text follows it.
        textStr = ''.join([
            "  <indexItems>\n",
            "    <customProperties>\n",
            "      <nameValues name=\"CLUSTERID\" value=\"" + clusterId + "\"/>\n",
            "      <nameValues name=\"CLUSTERLABEL\" value=\"" + keyWords + "\"/>\n",
            "    </customProperties>\n",
            "  </indexItems>\n",
        ] + subTxtParts)

        jsonStr = ''.join(
            [prefix + '{' + '\n', prefix + '  "name":"' + keyWords + '",\n']
            + subJsonParts
            + [prefix + '}\n'])

        return (textStr, jsonStr, keyWords)
    
    def outputOneLeafCluster(self, levels, prefix, docList):
        """Render the documents of one leaf cluster as ``(text, json)``.

        Every document gets an ``<indexItems>`` entry with its content id and
        one CLUSTERID line per prefix of the ``levels`` path (levels 1-9 are
        zero-padded to two digits).  Each document is flagged in
        ``self.processed``; a document seen before is reported with a print.
        When a message-queue client is configured, a ``doclabel`` message
        listing the documents is sent.

        Args:
            levels: level path of the cluster.
            prefix: indentation for the JSON lines.
            docList: indexes of the documents in the cluster.
        """
        # One CLUSTERID line for each ancestor path, e.g. "01", "01.02", ...
        cidLines = []
        cidStr = ''
        for level in levels:
            if 0 < level < 10:
                cidStr += '0'
            cidStr += str(level) + '.'
            cidValue = cidStr[:-1].replace('[', '').replace(']', '')
            cidLines.append("      <nameValues name=\"CLUSTERID\" value=\"" + cidValue + "\"/>\n")
        cidFullStr = ''.join(cidLines)

        textParts = []
        origIndexes = []
        for doc in docList:
            if self.processed[doc]:
                print('doc [' + str(doc) + '] access more than once')
            self.processed[doc] = True

            # Documents are reported with their 1-based position.
            origIndexes.append(str(doc + 1))

            textParts.append("  <indexItems>\n")
            textParts.append("    <customProperties>\n")
            textParts.append("      <nameValues name=\"CONTENTID\" value=\"" + self.contentids[doc] + "\"/>\n")
            textParts.append(cidFullStr)
            textParts.append("    </customProperties>\n")
            textParts.append("  </indexItems>\n")
        textParts.append('\n')

        jsonStr = (prefix + '  "value":' + str(len(docList)) + ',\n'
                   + prefix + '  "data": [' + ', '.join(origIndexes) + ']\n')

        if self.mqClient:
            mqRespStr = ('doclabel ' + str(levels) + ' '
                         + ''.join('contentid_' + idx + ' ' for idx in origIndexes))
            self.mqClient.send(self.respQName, mqRespStr.encode())

        return (''.join(textParts), jsonStr)
        
    
    def kmeansCluster(self, levels, curList, result):
        """Split ``curList`` with the shared k-means estimator and recurse.

        The documents are grouped by predicted label and the groups are
        handled from smallest to largest.  A group is appended to ``result``
        as a plain document list when it is small enough, when it is as large
        as ``curList`` itself, or when the level limit is reached; otherwise it
        is clustered again through ``doCluster``.
        """
        labels = clf.fit_predict(self.hash_matrix[curList])

        groups = {}
        for pos in range(len(labels)):
            groups.setdefault(labels[pos], []).append(curList[pos])
        orderedGroups = sorted(groups.items(), key=lambda pair: len(pair[1]))

        for _label, docList in orderedGroups:
            # Only the length of ``levels`` matters here, so push a placeholder.
            levels.append(' ')

            groupSize = len(docList)
            stopHere = (groupSize <= MIN_CLUSTER_SIZE
                        or groupSize == len(curList)
                        or len(levels) >= MAX_LEVEL)

            if stopHere:
                result.append(docList)
            else:
                subRes = []
                self.doCluster(levels, docList, subRes)
                if len(subRes) == 1:
                    # A single sub-result replaces the group instead of nesting.
                    result.append(subRes[0])
                else:
                    result.append((docList, subRes))

            levels.pop()
        
    def hierarchalCluster(self, levels, curList, result):
        """Cluster ``curList`` with Ward linkage and append the tree to ``result``.

        The linkage is computed from the cosine distances between the simhash
        rows of the documents, converted into a ``Tree``, simplified by
        ``mergeSimilarCluster`` and filled by ``fillTreeData``.  A list with
        fewer than two documents cannot be linked; its documents go to
        ``self.nogroupEmails`` and nothing is appended to ``result``.

        Args:
            levels: current level path (not used by this method).
            curList: indexes of the documents to cluster.
            result: list receiving the resulting tree.
        """
        self.removedNode = set()

        if len(curList) < 2:
            # Linkage needs at least two observations; a lone document is
            # reported with the ungrouped emails instead of raising.
            self.nogroupEmails.extend(curList)
            return

        dist = 1 - cosine_similarity(self.hash_matrix[curList])
        # NOTE(review): ward() is given the square distance matrix; SciPy's
        # linkage documentation describes 2-D input as observation vectors -
        # confirm this is intended before changing it (results would differ).
        linkage_matrix = ward(dist)
        tree = to_tree(linkage_matrix)
        nTree = Tree()
        convertTree(tree, nTree, curList)
        self.mergeSimilarCluster(nTree)
        self.fillTreeData(nTree)
        result.append(nTree)
    
    def doCluster(self, levels, curList, result):
        """Cluster ``curList`` into ``result`` with the strategy fitting its size.

        Lists longer than ``MAX_HIERARCHAL_SIZE`` are split with k-means, all
        other non-empty lists hierarchically.  An empty list is a no-op that
        returns ``('', '', [''])``.
        """
        docCount = len(curList)
        if docCount == 0:
            return ('', '', [''])

        if docCount > MAX_HIERARCHAL_SIZE:
            self.log("processing kmeans for " + str(docCount) + " docs")
            return self.kmeansCluster(levels, curList, result)

        self.log("processing hierarchal for " + str(docCount) + " docs")
        return self.hierarchalCluster(levels, curList, result)
        
    def getKeyworksFromSubTrees(self, children, keyWordsList):
        """Return the keyword label for the documents of all ``children``.

        ``keyWordsList`` is accepted but not used.
        """
        mergedDocs = [doc for child in children for doc in child.data]
        return self.getClusterKeyWords(mergedDocs)
            
    
    def generateSimHash(self):
        """Compute the simhash of every email and fill ``self.simHash`` / ``self.arr``.

        The emails are written, one per line, to temporary chunk files in
        ``self.dataFolder``; a thread pool hashes each chunk file into a
        companion hash file; the hash files are then read back in order.  For
        every hash the value is appended to ``self.simHash`` and its 64-bit
        binary form is stored as booleans in the matching row of ``self.arr``.
        The temporary files are removed afterwards, also on failure.

        NOTE(review): this relies on each email occupying exactly one line of
        its chunk file - confirm bodies cannot contain embedded line breaks.

        Raises:
            Exception: whatever a hashing worker raised is re-raised here.
        """
        prefix = str(uuid.uuid4())
        sizePreFile = int(float(len(self.emails) / FILE_COUNT) + 1)

        chunks = []  # (dataFileName, hashFileName) per chunk, in email order
        try:
            for start in range(0, len(self.emails), sizePreFile):
                chunkNo = str(len(chunks))
                dataFileName = os.path.join(self.dataFolder, prefix + '_' + chunkNo + '_sub.bin')
                hashFileName = os.path.join(self.dataFolder, prefix + '_' + chunkNo + '_hash.bin')
                chunks.append((dataFileName, hashFileName))
                with open(dataFileName, 'w') as dataFile:
                    for email in self.emails[start:start + sizePreFile]:
                        dataFile.write(email)
                        dataFile.write('\n')

            # Submit exactly one job per chunk that was written; leaving the
            # ``with`` block waits for all of them.
            with ThreadPoolExecutor(max_workers=POOL_SIZE) as executor:
                futures = [executor.submit(generateSimHashForEachFile, dataFileName, hashFileName)
                           for dataFileName, hashFileName in chunks]
            for future in futures:
                # Surface a worker failure instead of reading a partial file.
                future.result()

            index = 0
            for _dataFileName, hashFileName in chunks:
                with open(hashFileName, 'r') as hashFile:
                    for line in hashFile:
                        line = line.rstrip()
                        if not line:
                            continue
                        simHashVal = int(line)
                        self.simHash.append(simHashVal)
                        binstr = '{0:064b}'.format(simHashVal)
                        row = self.arr[index]
                        for j in range(COLS):
                            if binstr[j] == '1':
                                row[j] = True
                        index += 1
        finally:
            # Best-effort cleanup: a hash file may not exist if hashing failed.
            for chunkFiles in chunks:
                for path in chunkFiles:
                    try:
                        os.remove(path)
                    except FileNotFoundError:
                        pass
def getLevelsStr(levels):
    """Format a level path as dot-separated numbers, e.g. ``[1, 12]`` -> ``"01.12"``.

    Levels below 10 are prefixed with ``"0"``; an empty path gives ``""``.
    """
    return ".".join(
        ("0" + str(level)) if level < 10 else str(level)
        for level in levels
    )

def get_features(s):
    """Return the feature words of ``s`` in their original order.

    Runs of characters other than word characters and spaces are replaced by
    a space; words of at most two characters, stop words and all-digit words
    are dropped.
    """
    cleaned = re.sub(r'[^\w ]+', ' ', s)
    return [
        word
        for word in cleaned.split()
        if len(word) > 2 and word not in stopwords_set and not word.isdigit()
    ]

def truncate_words(text, size):
    """Return the first ``size`` tokens of ``text`` joined by single spaces."""
    kept = wpt.tokenize(text)[:size]
    return " ".join(kept)

def remove_stopwords(text):
    """Return ``text`` re-joined from its tokens without stop words and all-digit tokens."""
    kept = []
    for word in wpt.tokenize(text):
        if word in stopwords_set or word.isdigit():
            continue
        kept.append(word)
    return " ".join(kept)

def get_clean_text(text, tokens=False):
    """Strip bracketed parts and stop words from ``text`` and keep its first 20 tokens.

    Args:
        text: the raw text.
        tokens: when true, return the cleaned text split on whitespace
            instead of the cleaned string.

    Returns:
        The cleaned string, or a list of its words when ``tokens`` is true.
    """
    # Raw string: in a normal literal '\[' is an invalid escape sequence.
    # The pattern removes every innermost "[...]" group.
    text = re.sub(r'\[([^\[\]]*)\]', '', text)
    text = remove_stopwords(text)
    text = truncate_words(text, 20)

    return text.split() if tokens else text

def generateSimHashForEachFile(inF, outF):
    """Write the simhash of every line of file ``inF`` to file ``outF``.

    One decimal hash value is written per input line, in input order.
    """
    with open(inF, 'r') as source, open(outF, 'w') as sink:
        for line in source:
            docHash = Simhash(get_features(line)).value
            sink.write(str(docHash) + '\n')

class Tree(object):
    """Binary cluster-tree node carrying the documents assigned to it."""

    def __init__(self, parent = None):
        # Links to the surrounding nodes.
        self.parent = parent
        self.left = self.right = None
        # Document indexes held by this node.
        self.data = []
        # Simhash the node is registered under (set elsewhere for leaves).
        self.midHash = 0
        # Distance value copied from the source linkage node by convertTree.
        self.dist = 0

def convertTree(tree1, tree2, docList):
    """Mirror the structure of ``tree1`` into ``tree2`` using ``Tree`` nodes.

    For every source node a matching child ``Tree`` is created under the
    corresponding target node.  A source node whose ``id`` is a valid index
    into ``docList`` contributes that document to its target node, and the
    ``dist`` value is copied over.
    """
    docCount = len(docList)
    pending = [(tree1, tree2)]

    while pending:
        source, target = pending.pop()

        if source.left:
            target.left = Tree(target)
            pending.append((source.left, target.left))
        if source.right:
            target.right = Tree(target)
            pending.append((source.right, target.right))

        if source.id < docCount:
            target.data.append(docList[source.id])

        target.dist = source.dist
        
def getSubChildren(tree, res, deep):
    """Collect into ``res`` the sub-clusters found at most ``deep`` levels below ``tree``.

    Nodes without documents are skipped.  A node is collected as-is when the
    depth budget is used up, when it holds at most two documents, or when it
    has no children; otherwise its children are visited, left before right.
    """
    docCount = len(tree.data)
    if docCount == 0:
        return

    isLeaf = tree.left is None and tree.right is None
    if deep == 0 or docCount <= 2 or isLeaf:
        res.append(tree)
        return

    for child in (tree.left, tree.right):
        if child is not None:
            getSubChildren(child, res, deep - 1)


def getChildren(tree, level = 0):
    """Return the sub-clusters of ``tree`` to show one level below it.

    Nothing is returned once ``level`` has reached ``MAX_LEVEL`` or when the
    node holds at most ``MIN_CLUSTER_SIZE`` documents; otherwise the tree is
    searched up to ``CHILD_LEVEL`` levels deep.
    """
    collected = []
    tooDeep = level > MAX_LEVEL - 1
    tooSmall = len(tree.data) <= MIN_CLUSTER_SIZE
    if not (tooDeep or tooSmall):
        getSubChildren(tree, collected, CHILD_LEVEL)
    return collected

# Ad-hoc run against local sample files: arguments are the input file, the
# field separator token and the result file.
if __name__ == '__main__':    
    ec1 = EmailCluster('d:/temp/e.csv', 'bb370242ac130002', 'd:/temp/res.xml')
