当前位置:   article > 正文

js 含标签的文本对比_文本比对差异js

文本比对差异js
/**
 * 文本差异化比对工具,用于比对两段文本之间所存在的差异;
 * 对HTML文本的比对中,会完全忽略标签内的差异,将<...>
 * 中的内容当做一个整体的单词;
 * 注1:现有版本还未对执行效率进行优化;
 * 注2:原始算法来自于网络:http://www.rohland.co.za/index.php/2009/10/31/csharp-html-diff-algorithm/
 * 注3:效率真低啊..........╮(╯▽╰)╭
 * @author zhoujq | http://weibo.com/buxiaodewa
 */
(function() {
    // Global object the public API is attached to. `self` is used when it
    // exists (browser pages and workers); otherwise fall back to `globalThis`
    // or the sloppy-mode `this` so that loading the script does not throw a
    // ReferenceError in environments that do not define `self`.
    var root = typeof self !== 'undefined' ? self :
        typeof globalThis !== 'undefined' ? globalThis : this;
    /**
     * Match block: records where one run of identical tokens sits in the old
     * document and in the new document.
     * @param {Number} startInOld  start position of the run in the old document
     * @param {Number} startInNew  start position of the run in the new document
     * @param {Number} size        number of consecutive tokens in the run
     */
    function Match(startInOld, startInNew, size) {
        // Each end position is simply start + size.
        var endInOld = startInOld + size;
        var endInNew = startInNew + size;

        this.size = size;
        this.startInOld = startInOld;
        this.startInNew = startInNew;
        this.endInOld = endInOld;
        this.endInNew = endInNew;
    }

    /**
     * Operation unit: one action (insert, delete, equal or replace) together
     * with the old-document range and the new-document range it applies to.
     * @param {Function} action      handler to run for this operation
     * @param {Number}   startInOld  start of the range in the old document
     * @param {Number}   endInOld    end of the range in the old document
     * @param {Number}   startInNew  start of the range in the new document
     * @param {Number}   endInNew    end of the range in the new document
     */
    function Operation(action, startInOld, endInOld, startInNew, endInNew) {
        this.action = action;

        // Range ends are stored first, then the range starts.
        this.endInOld = endInOld;
        this.endInNew = endInNew;

        this.startInOld = startInOld;
        this.startInNew = startInNew;
    }

    /**
     * Splits an HTML string into a flat list of tokens.
     * A complete tag (<...>) is one token, a run of word characters is one
     * token, and every other character (each whitespace character, punctuation
     * mark, non-ASCII character, ...) is a token of its own.
     * Example: <a> Hello World </a>  ---> ["<a>", " ", "Hello", " ", "World", " ", "</a>"]
     * @param  {String}         html  markup to tokenize; null/undefined is treated as ""
     * @return {Array<String>}  tokens in document order; [] when there are none
     */
    var convertHtml2Words = function(html) {
            var text = html == null ? '' : String(html);
            // Alternatives, tried in this order at every position:
            //   <[^>]+>  a whole tag
            //   \w+      a run of word characters
            //   [\s\S]   any other single character. This also covers "|" and
            //            unpaired "<" / ">", which the former character class
            //            [^<|>|\w] excluded, so they vanished from the output.
            // match() yields null when nothing matches (empty input); callers
            // read .length on the result, so hand back an empty list instead.
            return text.match(/<[^>]+>|\w+|[\s\S]/g) || [];
        };

    /**
     * Diff builder: compares two HTML documents token by token and renders the
     * result as HTML in which inserted text is wrapped in <ins> and deleted
     * text in <del>. Tags themselves are never wrapped.
     * @param {String} oldVersion  old document
     * @param {String} newVersion  new document
     */
    var DiffBuilder = function(oldVersion, newVersion) {
        this.oldVersion = oldVersion;
        this.newVersion = newVersion;
        // Operation currently being rendered by performOperation().
        this.operation = null;
        // Operation list produced by getOperations().
        this.operations = [];
        // token -> ascending positions of that token in newWords.
        // Prototype-less on purpose: with a plain {} a token such as
        // "constructor" would resolve to an inherited member of
        // Object.prototype and be mistaken for an index list.
        this.wordIndices = Object.create(null);
        this.oldWords = [];
        this.newWords = [];
        // Output fragments; build() joins them into the final HTML.
        this.content = [];
    };

    DiffBuilder.prototype = {

        /**
         * Runs the whole comparison.
         * @return {String} the rendered diff, prefixed with a
         *                  "<br/>[耗时:N毫秒]" marker carrying the elapsed
         *                  milliseconds
         */
        build: function() {
            var startedAt = new Date();
            // Start from an empty buffer so that a second build() call does not
            // append to the previous result.
            this.content = [];
            this.splitInputs2Words();
            this.indexNewWords();
            this.operations = this.getOperations();
            this.performOperation();
            return "<br/>[耗时:"+(new Date() - startedAt)+"毫秒]"+this.content.join('');
        },

        /**
         * Executes every entry of this.operations in order. Each action is
         * called with the builder as `this` and the operation as its argument;
         * this.operation always points at the entry being processed.
         */
        performOperation: function() {
            var operations = this.operations;
            var current;
            for (var i = 0, len = operations.length; i < len; i++) {
                current = operations[i];
                this.operation = current;
                current.action.call(this, current);
            }
        },

        /**
         * Tokenizes both input documents into oldWords / newWords.
         */
        splitInputs2Words: function() {
            // The tokenizer may hand back null for input without tokens; the
            // rest of the builder reads .length, so normalise to [].
            this.oldWords = convertHtml2Words(this.oldVersion) || [];
            this.newWords = convertHtml2Words(this.newVersion) || [];
        },

        /**
         * Builds the lookup table for newWords.
         * Example:
         * this.newWords = ["<a>", " ", "Hello", " ", "World", " ", "</a>"];
         *
         * becomes --->
         *
         * this.wordIndices = {
         *     "<a>"   : [0],
         *     " "     : [1, 3, 5]
         *     "Hello" : [2],
         *     "World" : [4],
         *     "</a>"  : [6]
         * };
         */
        indexNewWords: function() {
            var newWords = this.newWords;
            // Rebuilt from scratch on every call, and without a prototype so
            // that only tokens really present in newWords can be found.
            var indices = Object.create(null);
            var word;
            for (var i = 0, len = newWords.length; i < len; i++) {
                word = newWords[i];
                if (!indices[word]) {
                    indices[word] = [];
                }
                indices[word].push(i);
            }
            this.wordIndices = indices;
        },

        /**
         * Turns the documents into a queue of operations, for example
         * [equal, insert, equal]: every entry pairs an action with the old and
         * new ranges it covers.
         * @return {Array<Operation>}
         */
        getOperations: function() {
            var operations = [];
            var posInOld = 0;
            var posInNew = 0;
            var matches = this.matchingBlocks();
            var match, action, gapInOld, gapInNew;

            // Zero-length sentinel at the end of both documents, so that
            // unmatched tokens after the last real match still get an operation.
            matches.push(new Match(this.oldWords.length, this.newWords.length, 0));

            for (var i = 0, len = matches.length; i < len; i++) {
                match = matches[i];
                // A gap means there are unmatched tokens between the previous
                // match and this one on that side.
                gapInOld = posInOld !== match.startInOld;
                gapInNew = posInNew !== match.startInNew;

                action = null;
                if (gapInOld && gapInNew) {
                    action = this.replace;
                } else if (gapInNew) {
                    action = this.insert;
                } else if (gapInOld) {
                    action = this.del;
                }

                if (action) {
                    operations.push(new Operation(action, posInOld, match.startInOld, posInNew, match.startInNew));
                }
                if (match.size) {
                    operations.push(new Operation(this.equal, match.startInOld, match.endInOld, match.startInNew, match.endInNew));
                }
                posInOld = match.endInOld;
                posInNew = match.endInNew;
            }
            return operations;
        },

        /**
         * Collects the match blocks describing everything the old and the new
         * document have in common.
         * @return {Array<Match>} match blocks in document order
         */
        matchingBlocks: function() {
            var blocks = [];
            this.findMatchingBlocks(0, this.oldWords.length, 0, this.newWords.length, blocks);
            return blocks;
        },

        /**
         * Recursively finds matches inside the given ranges: takes the match
         * returned by findMatch, then searches the ranges before it and after
         * it. Results are appended to matchingBlocks in document order.
         * @param  {Number} startInOld
         * @param  {Number} endInOld
         * @param  {Number} startInNew
         * @param  {Number} endInNew
         * @param  {Array<Match>} matchingBlocks  output list, appended to in place
         */
        findMatchingBlocks: function(startInOld, endInOld, startInNew, endInNew, matchingBlocks) {
            var match = this.findMatch(startInOld, endInOld, startInNew, endInNew);
            if (!match) {
                return;
            }
            // Search the left side only when both documents have tokens there.
            if (startInOld < match.startInOld && startInNew < match.startInNew) {
                this.findMatchingBlocks(startInOld, match.startInOld, startInNew, match.startInNew, matchingBlocks);
            }
            matchingBlocks.push(match);
            // Same for the right side.
            if (match.endInOld < endInOld && match.endInNew < endInNew) {
                this.findMatchingBlocks(match.endInOld, endInOld, match.endInNew, endInNew, matchingBlocks);
            }
        },

        /**
         * Finds the longest run of identical consecutive tokens between
         * oldWords[startInOld, endInOld) and newWords[startInNew, endInNew).
         * Of several runs with the same length, the one encountered first wins.
         * @param  {Number} startInOld
         * @param  {Number} endInOld
         * @param  {Number} startInNew
         * @param  {Number} endInNew
         * @return {Match|null} the best match, or null when the ranges share no token
         */
        findMatch: function(startInOld, endInOld, startInNew, endInNew) {
            var bestInOld = startInOld;
            var bestInNew = startInNew;
            var bestSize = 0;
            // For the previous old token: position in newWords -> length of the
            // common run ending at that position.
            var runLengthAt = {};
            var nextRunLengthAt, positions, posInNew, runLength, count;

            for (var idxOld = startInOld; idxOld < endInOld; idxOld++) {
                nextRunLengthAt = {};
                positions = this.wordIndices[this.oldWords[idxOld]];
                count = positions ? positions.length : 0;
                for (var i = 0; i < count; i++) {
                    posInNew = positions[i];
                    if (posInNew < startInNew) {
                        continue;
                    }
                    if (posInNew >= endInNew) {
                        // Positions are ascending, so none of the remaining
                        // ones can be inside the range either.
                        break;
                    }
                    // Extend the run that ended one token earlier on both sides.
                    runLength = (runLengthAt[posInNew - 1] || 0) + 1;
                    nextRunLengthAt[posInNew] = runLength;
                    if (runLength > bestSize) {
                        bestInOld = idxOld - runLength + 1;
                        bestInNew = posInNew - runLength + 1;
                        bestSize = runLength;
                    }
                }
                runLengthAt = nextRunLengthAt;
            }
            return bestSize ? new Match(bestInOld, bestInNew, bestSize) : null;
        },

        /**
         * Renders the new-document range of an operation as inserted text.
         * @param {Operation} opt
         * @param {String}    [tagCls]  CSS class, "diffins" when omitted
         */
        insert: function(opt, tagCls) {
            this.insertTag('ins', tagCls || 'diffins', this.newWords.slice(opt.startInNew, opt.endInNew));
        },

        /**
         * Renders the old-document range of an operation as deleted text.
         * @param {Operation} opt
         * @param {String}    [tagCls]  CSS class, "diffdel" when omitted
         */
        del: function(opt, tagCls) {
            this.insertTag('del', tagCls || 'diffdel', this.oldWords.slice(opt.startInOld, opt.endInOld));
        },

        /**
         * Copies the unchanged new-document range of an operation to the output.
         * @param {Operation} opt
         */
        equal: function(opt) {
            this.appendContent(this.newWords.slice(opt.startInNew, opt.endInNew));
        },

        /**
         * Renders a replacement: the old range as deleted, then the new range
         * as inserted, both with the CSS class "diffmod".
         * @param {Operation} opt
         */
        replace: function(opt) {
            this.del(opt, 'diffmod');
            this.insert(opt, 'diffmod');
        },

        /**
         * Appends fragments to the output buffer in place (no array copy).
         * @param {Array<String>} fragments
         */
        appendContent: function(fragments) {
            for (var i = 0, len = fragments.length; i < len; i++) {
                this.content.push(fragments[i]);
            }
        },

        /**
         * Writes words to the output: every run of non-tag words is joined and
         * wrapped in <tagName class="cssCls">, every run of tags is emitted
         * as-is.
         * @param  {String}        tagName
         * @param  {String}        cssCls
         * @param  {Array<String>} words    consumed: the array is empty afterwards
         */
        insertTag: function(tagName, cssCls, words) {
            var textRun;
            while (words.length) {
                // Leading run of non-tag words (everything before the first tag).
                textRun = this.extractConsecutiveWords(words, true);
                if (textRun.length) {
                    this.content.push(this.warpText(textRun.join(''), tagName, cssCls));
                }
                if (words.length) {
                    // Leading run of tags (everything before the first non-tag).
                    this.appendContent(this.extractConsecutiveWords(words, false));
                }
            }
        },

        /**
         * Removes and returns the leading words of `words` up to, but not
         * including, the first word whose "is a tag" status equals isTag.
         * With isTag = true that is the leading run of text, with isTag = false
         * the leading run of tags. When no word qualifies, all words are taken.
         * @param  {Array<String>} words  modified in place
         * @param  {Boolean} isTag
         * @return {Array<String>}
         */
        extractConsecutiveWords: function(words, isTag) {
            var wantTag = Boolean(isTag);
            var stopAt = words.length;
            for (var i = 0, len = words.length; i < len; i++) {
                // isTag() returns a 0/1/2 code; compare on truthiness so that
                // both opening (1) and closing (2) tags count as tags.
                if ((this.isTag(words[i]) !== 0) === wantTag) {
                    stopAt = i;
                    break;
                }
            }
            return words.splice(0, stopAt);
        },

        /**
         * Wraps text in an element carrying the highlight style and CSS class.
         * @param  {String} text
         * @param  {String} tagName  "ins" gets the insert style, anything else the delete style
         * @param  {String} cssCls
         * @return {String}
         */
        warpText: function(text, tagName, cssCls) {
            var tagStyle = tagName === 'ins' ? 'background-color: #cfc;text-decoration: none;' : 'background-color: #FEC8C8;';
            return '<' + tagName + ' style="' + tagStyle + '"' + ' class="' + cssCls + '">' + text + '</' + tagName + '>';
        },

        /**
         * @param  {String} item
         * @return {Boolean} true for a tag that does not start with "</"
         */
        isOpeningTag: function(item) {
            return /^\s*<(?!\/)[^>]+>\s*$/.test(item);
        },

        /**
         * @param  {String} item
         * @return {Boolean} true for a tag that starts with "</"
         */
        isClosingTag: function(item) {
            return /^\s*<\/[^>]*>\s*$/.test(item);
        },

        /**
         * Classifies a token.
         * @param  {String} item
         * @return {Number} 1 for an opening tag, 2 for a closing tag, 0 for anything else
         */
        isTag: function(item) {
            return this.isOpeningTag(item) ? 1 : this.isClosingTag(item) ? 2 : 0;
        }
    };

    /**
     * Public entry point: creates a DiffBuilder for the two versions and
     * returns whatever its build() call produces.
     * @param  {String} oldVersion  old document
     * @param  {String} newVersion  new document
     * @return {String}
     */
    root.getHTMLDiff = function(oldVersion, newVersion) {
        var builder = new DiffBuilder(oldVersion, newVersion);
        return builder.build();
    };

    /**
     * Web Worker support: when the script runs inside a worker, every incoming
     * message carrying {oldVersion, newVersion} is answered with the diff of
     * the two versions.
     * The handler is installed only where importScripts exists, i.e. in a
     * worker scope. Checking postMessage alone is not enough: it is also a
     * function on a page's window, where the handler would replace
     * window.onmessage and post to the page itself.
     */
    if (typeof importScripts === "function" && typeof postMessage === "function") {
        // Assigned through root instead of relying on an implicit global.
        root.onmessage = function (evt) {
            var data = evt.data;
            var diff = root.getHTMLDiff(data.oldVersion, data.newVersion);
            root.postMessage(diff);
        };
    }
})();
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
  • 74
  • 75
  • 76
  • 77
  • 78
  • 79
  • 80
  • 81
  • 82
  • 83
  • 84
  • 85
  • 86
  • 87
  • 88
  • 89
  • 90
  • 91
  • 92
  • 93
  • 94
  • 95
  • 96
  • 97
  • 98
  • 99
  • 100
  • 101
  • 102
  • 103
  • 104
  • 105
  • 106
  • 107
  • 108
  • 109
  • 110
  • 111
  • 112
  • 113
  • 114
  • 115
  • 116
  • 117
  • 118
  • 119
  • 120
  • 121
  • 122
  • 123
  • 124
  • 125
  • 126
  • 127
  • 128
  • 129
  • 130
  • 131
  • 132
  • 133
  • 134
  • 135
  • 136
  • 137
  • 138
  • 139
  • 140
  • 141
  • 142
  • 143
  • 144
  • 145
  • 146
  • 147
  • 148
  • 149
  • 150
  • 151
  • 152
  • 153
  • 154
  • 155
  • 156
  • 157
  • 158
  • 159
  • 160
  • 161
  • 162
  • 163
  • 164
  • 165
  • 166
  • 167
  • 168
  • 169
  • 170
  • 171
  • 172
  • 173
  • 174
  • 175
  • 176
  • 177
  • 178
  • 179
  • 180
  • 181
  • 182
  • 183
  • 184
  • 185
  • 186
  • 187
  • 188
  • 189
  • 190
  • 191
  • 192
  • 193
  • 194
  • 195
  • 196
  • 197
  • 198
  • 199
  • 200
  • 201
  • 202
  • 203
  • 204
  • 205
  • 206
  • 207
  • 208
  • 209
  • 210
  • 211
  • 212
  • 213
  • 214
  • 215
  • 216
  • 217
  • 218
  • 219
  • 220
  • 221
  • 222
  • 223
  • 224
  • 225
  • 226
  • 227
  • 228
  • 229
  • 230
  • 231
  • 232
  • 233
  • 234
  • 235
  • 236
  • 237
  • 238
  • 239
  • 240
  • 241
  • 242
  • 243
  • 244
  • 245
  • 246
  • 247
  • 248
  • 249
  • 250
  • 251
  • 252
  • 253
  • 254
  • 255
  • 256
  • 257
  • 258
  • 259
  • 260
  • 261
  • 262
  • 263
  • 264
  • 265
  • 266
  • 267
  • 268
  • 269
  • 270
  • 271
  • 272
  • 273
  • 274
  • 275
  • 276
  • 277
  • 278
  • 279
  • 280
  • 281
  • 282
  • 283
  • 284
  • 285
  • 286
  • 287
  • 288
  • 289
  • 290
  • 291
  • 292
  • 293
  • 294
  • 295
  • 296
  • 297
  • 298
  • 299
  • 300
  • 301
  • 302
  • 303
  • 304
  • 305
  • 306
  • 307
  • 308
  • 309
  • 310
  • 311
  • 312
  • 313
  • 314
  • 315
  • 316
  • 317
  • 318
  • 319
  • 320
  • 321
  • 322
  • 323
  • 324
声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/Li_阴宅/article/detail/796933
推荐阅读
相关标签
  

闽ICP备14008679号