-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathreview.py
More file actions
133 lines (101 loc) · 3.75 KB
/
review.py
File metadata and controls
133 lines (101 loc) · 3.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import xml.etree.ElementTree as Parser
import utils
from nltk.tokenize import word_tokenize as tokenize
class Opinion:
    """A single opinion annotation: target, category, polarity and span.

    ``category`` is split on ``'#'``; the first piece is exposed as ``ent``
    and the second as ``attr``.  A category with no ``'#'`` therefore raises
    ``IndexError``.  All other arguments are stored unchanged.
    """

    def __init__(self, target, category, polarity, frm, to):
        # Split once and pick the two leading pieces; any further '#'-separated
        # pieces are ignored.
        pieces = category.split('#')
        entity, attribute = pieces[0], pieces[1]

        self.target = target
        self.category = category
        self.ent, self.attr = entity, attribute
        self.polarity = polarity
        # Span boundaries exactly as supplied by the caller.
        self.frm, self.to = frm, to
class Review:
    """A review sentence with its opinions and its preprocessed tokens.

    ``tokens`` is the result of tokenizing ``string`` and passing the tokens
    through ``utils.to_lower`` and then ``utils.filter_symbol``.  ``ids``
    starts out as ``None``; ``ent_attr_to_words`` assigns it later.
    """

    def __init__(self, string, opinions):
        self.string, self.opinions = string, opinions

        # Same pipeline as a single nested call, spelled out step by step:
        # tokenize -> utils.to_lower -> utils.filter_symbol.
        raw_tokens = tokenize(string)
        lowered_tokens = utils.to_lower(raw_tokens)
        self.tokens = utils.filter_symbol(lowered_tokens)

        self.ids = None
def _index_labels(labels):
    """Return ``{label: id}`` with ids 0..n-1 assigned in sorted label order."""
    return {label: idx for idx, label in enumerate(sorted(labels))}


def load_semeval_reviews(filename):
    """Load annotated sentences from an XML file and index their labels.

    Every ``<sentence>`` element that has an ``<Opinions>`` child containing at
    least one ``<Opinion>`` becomes a ``Review``; sentences without opinions
    are skipped.  The number of ``<sentence>`` elements found is printed.

    :param filename: path (or file object) accepted by ``ElementTree.parse``.
    :return: a 4-tuple ``(reviews, ent2idx, attr2idx, pol2idx)`` where
        ``reviews`` is a list of ``Review`` objects and the three dicts map
        each distinct entity / attribute / polarity string to an integer id.
        Ids are assigned in sorted label order so that the numbering is
        reproducible (it does not depend on set iteration order).
    """
    sentences = Parser.parse(filename).getroot().findall('.//sentence')
    # One pre-formatted argument: prints the same text whether ``print`` is
    # the Python 2 statement or the Python 3 function.
    print('len of <sentences>: %d' % len(sentences))

    reviews = []
    for sentence in sentences:
        opinions = sentence.find('Opinions')
        if opinions is None:
            continue
        # 'from'/'to' default to 0 when the attribute is absent, instead of
        # failing on int(None).
        ops = [Opinion(op.get('target'),
                       op.get('category'),
                       op.get('polarity'),
                       int(op.get('from', 0)),
                       int(op.get('to', 0)))
               for op in opinions.findall('Opinion')]
        if ops:
            reviews.append(Review(sentence.find('text').text, ops))

    # Distinct labels over all kept reviews.
    ents, attrs, pols = set(), set(), set()
    for review in reviews:
        for opinion in review.opinions:
            ents.add(opinion.ent)
            attrs.add(opinion.attr)
            pols.add(opinion.polarity)

    return (reviews,
            _index_labels(ents),
            _index_labels(attrs),
            _index_labels(pols))
def ent_attr_to_words(reviews, word2idx):
    """Group word ids by the entities and attributes they co-occur with.

    For every review, the non-stopword tokens are converted to ids and stored
    on ``review.ids`` (side effect).  The ids of the covered words are then
    added to the set of every entity and every attribute mentioned by that
    review's opinions.  A per-label summary is printed at the end.

    :param reviews: iterable of objects exposing ``tokens`` and ``opinions``
        (each opinion exposing ``ent`` and ``attr``); ``ids`` is overwritten.
    :param word2idx: mapping indexed as ``word2idx[token]``; a token missing
        from it raises whatever the mapping raises (``KeyError`` for a dict).
    :return: ``(ent_map, attr_map)``, two ``defaultdict(set)`` objects, e.g.
        ``{'FOOD': set([4, 6, 8])}`` and ``{'QUALITY': set([2, 6, 9])}``.
        Labels are only present when at least one id was collected for them.
    """
    from collections import defaultdict
    import constants

    # Difference of the two vocabularies returned by the utils helpers.
    # NOTE(review): presumably these are word strings (SemEval words missing
    # from the Google News vocabulary) -- confirm against utils.
    not_covered = set(utils.load_semeval_words()) - set(utils.google_news_words())
    # Built once so the per-token membership test is a hash lookup.
    stopwords = frozenset(constants.stopwords)

    # e.g. {'FOOD': set(4, 6, 8)}
    ent_map = defaultdict(set)
    # e.g. {'QUALITY': set(2, 6, 9)}
    attr_map = defaultdict(set)

    for review in reviews:
        # words -> ids for performance
        kept_tokens = [tok for tok in review.tokens if tok not in stopwords]
        review.ids = [word2idx[tok] for tok in kept_tokens]

        # The exclusion set is derived from word lists, so the *token* must be
        # tested against it; testing only the id never matched.  The id test
        # is kept as well so behaviour is unchanged if the helpers turn out
        # to return ids.
        covered_ids = set(idx for tok, idx in zip(kept_tokens, review.ids)
                          if tok not in not_covered and idx not in not_covered)
        if not covered_ids:
            # Do not create empty entries: consumers sum over each set.
            continue

        # extract entities and attributes
        ents = set(opinion.ent for opinion in review.opinions)
        attrs = set(opinion.attr for opinion in review.opinions)

        for ent in ents:
            ent_map[ent].update(covered_ids)
        for attr in attrs:
            attr_map[attr].update(covered_ids)

    # Pre-formatted single argument: identical output under the Python 2
    # print statement and the Python 3 print function.
    for label, ids in ent_map.items():
        print('%s contains %d words' % (label, len(ids)))
    for label, ids in attr_map.items():
        print('%s contains %d words' % (label, len(ids)))
    print('\n')

    return ent_map, attr_map
def _summed_vectors_by_id(label2words, label2idx, id2vec):
    """Sum ``id2vec(w)`` over each label's words; rows ordered by label id."""
    import numpy as np
    # ``reduce`` is only a builtin on Python 2; functools has it on 2.6+ and 3.
    from functools import reduce
    from operator import itemgetter

    # [(X1, vec), (X2, vec), ...] sorted on the id only, so the vectors
    # themselves are never compared.
    pairs = sorted(((label2idx[label], reduce(np.add, map(id2vec, words)))
                    for label, words in label2words.items()),
                   key=itemgetter(0))
    return np.array([vec for _, vec in pairs])


def make_ent_attr_embedding(reviews, word2idx, id2vec, ent2idx, attr2idx):
    """Build one summed word-vector per entity and per attribute.

    The word ids associated with each entity / attribute are obtained from
    ``ent_attr_to_words(reviews, word2idx)``; each id is turned into a vector
    by calling ``id2vec(id)`` and the vectors of a label are added together.

    :param reviews: passed through to ``ent_attr_to_words``.
    :param word2idx: passed through to ``ent_attr_to_words``.
    :param id2vec: callable mapping a word id to a vector that ``numpy.add``
        accepts.
    :param ent2idx: mapping from entity label to integer id.
    :param attr2idx: mapping from attribute label to integer id.
    :return: ``(ent_embedding, attr_embedding)``, two numpy arrays whose rows
        are ordered by ascending label id.
    :raises KeyError: if a label with words is missing from ``ent2idx`` /
        ``attr2idx`` (when those are dicts).
    :raises TypeError: if a label maps to an empty word set (``reduce`` of an
        empty sequence).

    NOTE(review): only labels that have at least one word produce a row, so a
    row position equals the label id only when every id has words -- confirm
    that callers can rely on this.
    """
    # e.g. {'FOOD': set(4, 6, 8)}
    ent2words, attr2words = ent_attr_to_words(reviews, word2idx)
    return (_summed_vectors_by_id(ent2words, ent2idx, id2vec),
            _summed_vectors_by_id(attr2words, attr2idx, id2vec))