-
Notifications
You must be signed in to change notification settings - Fork 17
/
Copy pathlexrank.js
138 lines (109 loc) · 3.32 KB
/
lexrank.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
(function () {
'use strict';
var unfluff = require('unfluff');
var request = require('request');
var Tokenizer = require('sentence-tokenizer');
var natural = require('natural');
var wuzzy = require('wuzzy');
/**
 * Downloads a page, extracts its text with `unfluff` and hands it to
 * `summarize`.
 *
 * @param {string} url - Address handed to `request`.
 * @param {number} lines - Maximum number of sentences, forwarded to `summarize`.
 * @param {Function} callback - Receives an error as its first argument when
 *   the download fails, the response status is not 200, or no text could be
 *   extracted; otherwise it is invoked by `summarize`.
 */
function summarizePage(url, lines, callback) {
  request(url, function (err, resp, body) {
    if (err) {
      return callback(err);
    }
    // A non-200 reply arrives with a null `err`; surface it as a real error
    // so callers do not mistake it for success.
    if (!resp || resp.statusCode !== 200) {
      var status = resp ? resp.statusCode : 'none';
      return callback(new Error('unexpected status code: ' + status));
    }
    var text = unfluff(body).text;
    if (!text) {
      return callback(new Error('no text to summarize'));
    }
    summarize(text, lines, callback);
  });
}
/**
 * Builds an extractive summary of `text`.
 *
 * The text is split into sentences, each sentence is lower-cased and
 * word-tokenized, a similarity matrix is built from the token lists and the
 * sentences are ranked with `pageRank`. The best `lines` sentences are then
 * put back into their original order.
 *
 * @param {string} text - Text to summarize.
 * @param {number} lines - Maximum number of sentences to keep.
 * @param {Function} callback - Called as
 *   `callback(false, topLines, concatenated)`, where `topLines` holds the
 *   selected ranking entries and `concatenated` is their `text` values, each
 *   followed by a single space.
 */
function summarize(text, lines, callback) {
  var splitter = new Tokenizer('utterer');
  splitter.setEntry(text);
  var originals = splitter.getSentences().slice();

  var treebank = new natural.TreebankWordTokenizer();
  var tokenized = originals.map(function (sentence) {
    return treebank.tokenize(sentence.toLowerCase());
  });

  var ranked = pageRank(constructMatrix(tokenized), originals);

  // Keep the highest ranked entries, then restore document order.
  var limit = Math.min(lines, ranked.length);
  var picked = ranked
    .filter(function (entry, position) {
      return position < limit;
    })
    .sort(function (left, right) {
      return left.index - right.index;
    });

  var joined = picked
    .map(function (entry) {
      return entry.text + ' ';
    })
    .join('');

  callback(false, picked, joined);
}
/**
 * Creates an array in which every slot holds the same value.
 *
 * @param {number} length - Number of entries to create.
 * @param {*} value - Value stored in every entry (not copied).
 * @returns {Array} The newly created array.
 */
function fillArray(length, value) {
  var filled = [];
  while (filled.length < length) {
    filled.push(value);
  }
  return filled;
}
/**
 * Scores sentences by repeatedly multiplying the similarity matrix with a
 * score vector and re-normalizing the result.
 *
 * @param {number[][]} matrix - Square matrix with one row per sentence.
 * @param {string[]} sentencesOriginal - Sentences in their original order.
 * @returns {Array<{weight: number, text: string, index: number}>} One entry
 *   per sentence, sorted by descending weight.
 */
function pageRank(matrix, sentencesOriginal) {
  var size = sentencesOriginal.length;
  var scores = fillArray(size, 1);

  // Fixed number of multiply-and-normalize rounds.
  for (var round = 0; round < 10; round++) {
    var next = [];
    for (var row = 0; row < size; row++) {
      var sum = 0;
      for (var col = 0; col < size; col++) {
        sum = sum + (matrix[row][col] * scores[col]);
      }
      next[row] = sum;
    }
    scores = normalize(next);
  }

  // Pair every sentence with its score and original position.
  var ranked = sentencesOriginal.map(function (sentence, position) {
    return {
      weight: scores[position],
      text: sentence,
      index: position
    };
  });

  // Highest weight first.
  return ranked.sort(function (left, right) {
    return right.weight - left.weight;
  });
}
/**
 * Builds the sentence similarity matrix.
 *
 * Cell [i][j] holds the `wuzzy.tanimoto` score of sentence j against
 * sentence i. Each row is passed through `normalize` before being stored.
 *
 * @param {Array<string[]>} sentences - Token list for every sentence.
 * @param {number} [threshold] - When truthy, scores below it are set to 0.
 * @returns {number[][]} Matrix with one normalized row per sentence.
 */
function constructMatrix(sentences, threshold) {
  return sentences.map(function (rowTokens) {
    var row = sentences.map(function (columnTokens) {
      var score = wuzzy.tanimoto(columnTokens, rowTokens);
      return (threshold && score < threshold) ? 0 : score;
    });
    return normalize(row);
  });
}
/**
 * Scales a numeric vector in place to unit Euclidean length.
 *
 * @param {number[]} array - Vector to scale; modified in place.
 * @returns {number[]} The same array instance. A vector whose length is 0
 *   (including an empty array) is returned unchanged.
 */
function normalize(array) {
  var sumOfSquares = 0;
  for (var i = 0; i < array.length; i++) {
    sumOfSquares += array[i] * array[i];
  }
  var distance = Math.sqrt(sumOfSquares);
  // Dividing by a zero norm would turn every entry into NaN (0 / 0).
  if (distance === 0) {
    return array;
  }
  for (var j = 0; j < array.length; j++) {
    array[j] = array[j] / distance;
  }
  return array;
}
// Public interface of this module: `summarize` works on raw text,
// `summarizePage` fetches a URL first and then summarizes its text.
module.exports = {
summarize: summarize,
summarizePage: summarizePage
}
})();