You need to sign in or sign up before continuing.
words_merge.py 1.25 KB
# -*- coding: utf-8 -*-
from __future__ import unicode_literals


class SimpleMerge(object):

    def __init__(self, doc, words):
        self.doc = doc
        self.words = words

    def merge(self):
        trans = {}
        for w in self.words:
            trans[w] = ''
        for w1 in self.words:
            cw = 0
            lw = len(w1)
            for i in range(len(self.doc)-lw+1):
                if w1 == self.doc[i: i+lw]:
                    cw += 1
            for w2 in self.words:
                cnt = 0
                l2 = len(w1)+len(w2)
                for i in range(len(self.doc)-l2+1):
                    if w1+w2 == self.doc[i: i+l2]:
                        cnt += 1
                if cw < cnt*2:
                    trans[w1] = w2
                    break
        ret = []
        for w in self.words:
            if w not in trans:
                continue
            s = ''
            now = trans[w]
            while now:
                s += now
                if now not in trans:
                    break
                tmp = trans[now]
                del trans[now]
                now = tmp
            trans[w] = s
        for w in self.words:
            if w in trans:
                ret.append(w+trans[w])
        return ret