-
Notifications
You must be signed in to change notification settings - Fork 540
/
Copy pathnbsvm.py
60 lines (50 loc) · 1.43 KB
/
nbsvm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# Naive-Bayes features
# Derived from https://github.com/mesnilgr/nbsvm
import os
import pdb
import numpy as np
from collections import Counter
from scipy.sparse import lil_matrix
from scipy.sparse import csr_matrix
def tokenize(sentence, grams):
    """Return the n-gram tokens of a whitespace-split sentence.

    For every size in ``grams`` (in the order given) each run of that many
    consecutive words is joined with ``"_*_"`` to form one token. Sizes
    larger than the number of words contribute no tokens.
    """
    words = sentence.split()
    n_words = len(words)
    return [
        "_*_".join(words[start:start + size])
        for size in grams
        for start in range(n_words - size + 1)
    ]
def build_dict(X, grams):
    """Count n-gram token occurrences over every sentence in ``X``.

    Each sentence is tokenized with ``tokenize(sentence, grams)`` and the
    resulting tokens are tallied into a single ``Counter``, which is returned.
    """
    return Counter(
        token
        for sentence in X
        for token in tokenize(sentence, grams)
    )
def compute_ratio(poscounts, negcounts, alpha=1):
alltokens = list(set(poscounts.keys() + negcounts.keys()))
dic = dict((t, i) for i, t in enumerate(alltokens))
d = len(dic)
p, q = np.ones(d) * alpha , np.ones(d) * alpha
for t in alltokens:
p[dic[t]] += poscounts[t]
q[dic[t]] += negcounts[t]
p /= abs(p).sum()
q /= abs(q).sum()
r = np.log(p/q)
return dic, r
def process_text(text, dic, r, grams):
    """Return the sparse feature matrix for a sequence of text lines.

    The result is a CSR matrix with one row per entry of ``text`` and one
    column per entry of ``dic``. For row ``i``, every column ``j`` whose
    token occurs in ``text[i]`` is set to ``r[j]``; tokens not present in
    ``dic`` are ignored, and repeated tokens are written only once.
    """
    features = lil_matrix((len(text), len(dic)))
    for row, line in enumerate(text):
        # Distinct vocabulary columns hit by this line (unknown tokens skipped).
        columns = {dic[token] for token in tokenize(line, grams) if token in dic}
        # Fill in ascending column order.
        for col in sorted(columns):
            features[row, col] = r[col]
    return csr_matrix(features)