document-search-engine/util.py at main · TiffanyKousiman/document-search-engine · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
import glob, os, re
import string
from stemming.porter2 import stem

class BowDoc:
    """A single document stored as a bag of words.

    Holds the document ID, a term -> occurrence-count dictionary, and a
    document length that the caller sets separately via set_doc_len."""

    def __init__(self, docid):
        """Create an empty document with the given ID.

        The term dictionary starts out empty and the document length at 0.
        Use add_term to record term occurrences."""
        self.docid = docid
        self.doc_len = 0
        self.terms = {}

    def add_term(self, term):
        """Record one more occurrence of term.

        Call once per occurrence of the term in the document."""
        self.terms[term] = self.terms.get(term, 0) + 1

    def get_term_count(self, term):
        """Return how many times term was added, or 0 if it never was."""
        return self.terms.get(term, 0)

    def get_term_freq_dict(self):
        """Return the underlying term:freq dictionary (not a copy)."""
        return self.terms

    def get_term_list(self):
        """Return all distinct terms of the document as a sorted list."""
        return sorted(self.terms)

    def get_docid(self):
        """Return the document ID passed to the constructor."""
        return self.docid

    def __iter__(self):
        """Iterate over (term, frequency) tuples.

        Pairs are yielded by descending frequency; terms with equal
        frequency keep their insertion order (the sort is stable)."""
        # For alphabetical order instead, use: sorted(self.terms.items())
        ranked = list(self.terms.items())
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        return iter(ranked)

    def get_doc_len(self):
        """Return the document length last stored with set_doc_len."""
        return self.doc_len

    def set_doc_len(self, doc_len):
        """Store the document length."""
        self.doc_len = doc_len

class BowColl:
    """A collection of bag-of-words documents plus collection statistics.

    Documents are stored in a dictionary keyed by docid.  The statistics
    (document frequencies, cumulative term frequencies, average and total
    document length) are not computed here; they are stored via the setters."""

    def __init__(self):
        """Create an empty collection with all statistics zeroed/empty."""
        self.docs = {}
        self.doc_freq = {}  # term -> document frequency
        self.cum_tf = {}    # term -> cumulative term frequency
        self.total_doc_len = 0
        self.avg_doc_len = 0

    def add_doc(self, doc):
        """Store doc under the key returned by doc.get_docid().

        A document already stored under the same docid is replaced."""
        key = doc.get_docid()
        self.docs[key] = doc

    def get_doc(self, docid):
        """Look up a document by its docid.

        Raises KeyError when no document with that ID has been added."""
        return self.docs[docid]

    def get_docs(self):
        """Return the docid -> document dictionary itself (not a copy)."""
        return self.docs

    def inorder_iter(self):
        """Return an iterator that walks the documents in docid order.

        Each element is a document; call doc.get_docid() for its ID.
        Modifying the collection during iteration leads to undefined
        results."""
        return BowCollInorderIterator(self)

    def get_num_docs(self):
        """Return how many documents the collection holds."""
        return len(self.docs)

    def __iter__(self):
        """Iterator interface; delegates to inorder_iter."""
        return self.inorder_iter()

    def set_doc_freq(self, doc_freq):
        """Store the term:df dictionary for the collection."""
        self.doc_freq = doc_freq

    def get_doc_freq(self):
        """Return the stored term:df dictionary."""
        return self.doc_freq

    def set_cum_freq(self, cum_tf):
        """Store the term:cum_tf dictionary for the collection."""
        self.cum_tf = cum_tf

    def get_cum_freq(self):
        """Return the stored term:cumulative-term-frequency dictionary."""
        return self.cum_tf

    def set_avg_doc_len(self, avg_len):
        """Store the average document length of the collection."""
        self.avg_doc_len = avg_len

    def get_avg_doc_len(self):
        """Return the stored average document length."""
        return self.avg_doc_len

    def set_total_doc_len(self, total_len):
        """Store the summed document length of the collection."""
        self.total_doc_len = total_len

    def get_total_doc_len(self):
        """Return the stored total document length."""
        return self.total_doc_len

class BowQuery:
    """A single query stored as a bag of words.

    Holds the query ID and a term -> occurrence-count dictionary."""

    def __init__(self, q_id):
        """Create an empty query with the given ID.

        The term dictionary starts out empty; use add_term to fill it."""
        self.terms = {}
        self.id = q_id

    def add_term(self, term):
        """Record one more occurrence of term.

        Call once per occurrence of the term in the query."""
        self.terms[term] = self.terms.get(term, 0) + 1

    def get_term_count(self, term):
        """Return how many times term was added, or 0 if it never was."""
        return self.terms.get(term, 0)

    def get_term_freq_dict(self):
        """Return the underlying term:freq dictionary (not a copy)."""
        return self.terms

    def get_term_list(self):
        """Return all distinct query terms as a sorted list."""
        return sorted(self.terms)

    def get_id(self):
        """Return the query ID passed to the constructor."""
        return self.id

    def __iter__(self):
        """Iterate over (term, frequency) tuples.

        Pairs are yielded by descending frequency; terms with equal
        frequency keep their insertion order (the sort is stable)."""
        # For alphabetical order instead, use: sorted(self.terms.items())
        ranked = list(self.terms.items())
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        return iter(ranked)

class QueryColl:
    """A collection of bag-of-words queries, keyed by query ID."""

    def __init__(self):
        """Create an empty query collection."""
        self.queries = {}

    def add_query(self, query):
        """Store query under the key returned by query.get_id().

        A query already stored under the same ID is replaced."""
        key = query.get_id()
        self.queries[key] = query

    def get_query(self, q_id):
        """Look up a query by its ID.

        Raises KeyError when no query with that ID has been added."""
        return self.queries[q_id]

    def get_queries(self):
        """Return the id -> query dictionary itself (not a copy)."""
        return self.queries

class BowCollInorderIterator:
    """Iterator over the documents of a collection, in sorted docid order.

    The docids are snapshotted and sorted when the iterator is created;
    each step then fetches the document for the next docid from the
    collection.  Sorting uses the natural ordering of the docid values
    (so string docids sort lexicographically, e.g. "10" before "2")."""

    def __init__(self, coll):
        """Constructor.

        coll is the collection to iterate over; it must provide
        get_docs() (docid -> document mapping) and get_doc(docid)."""
        self.coll = coll
        self.keys = sorted(coll.get_docs().keys())
        self.i = 0

    def __iter__(self):
        """Iterator interface: the iterator is its own iterable."""
        return self

    def __next__(self):
        """Return the next document, or raise StopIteration when exhausted.

        Python 3's iterator protocol looks up __next__; with only a
        Python 2 style next() method, `for doc in coll` fails with
        "TypeError: iter() returned non-iterator"."""
        if self.i >= len(self.keys):
            raise StopIteration
        doc = self.coll.get_doc(self.keys[self.i])
        self.i += 1
        return doc

    # Keep the original method name so explicit it.next() calls still work.
    next = __next__

def parse_rcv_coll(inputpath, stop_words):
    """Parse the RCV1 XML files in a folder into a BowColl collection.

    Every "*.xml" file in inputpath yields at most one document.  The
    docid is taken from the itemid="..." attribute of the <newsitem ...>
    line; the lines between <text> and </text> are tokenised after
    removing <p>/</p> tags, deleting digits and replacing punctuation
    with spaces.  Each token is lower-cased and stemmed, and is indexed
    only if the stem is longer than 2 characters and not in stop_words.
    The document length counts every token, including the ones that are
    not indexed.  Files without a recognisable <newsitem> itemid are
    skipped.

    Params
    -------
    inputpath   : folder name of the RCV1 data files
    stop_words  : container of stop words (tested with `in`)

    Returns the parsed BowColl.

    NOTE: the function calls os.chdir(inputpath) and does not change
    back, so the working directory of the process is altered.  It also
    performs very limited error checking."""
    # Single table: punctuation -> space, digits -> deleted.  Built once
    # here instead of rebuilding two tables for every line.
    cleanup_table = str.maketrans(
        string.punctuation, ' ' * len(string.punctuation), string.digits)

    coll = BowColl()
    os.chdir(inputpath)
    for file_ in glob.glob("*.xml"):
        curr_doc = None
        in_text = False  # True once the <text> line has been seen
        word_count = 0
        # `with` guarantees the file is closed, including on the early
        # `break` at </text>; the bare open() leaked one handle per file.
        with open(file_) as handle:
            for line in handle:
                line = line.strip()
                if not in_text:
                    if line.startswith("<newsitem "):
                        for part in line.split():
                            if part.startswith("itemid="):
                                docid = part.split("=")[1].split("\"")[1]
                                curr_doc = BowDoc(docid)
                                break
                        continue
                    if line.startswith("<text>"):
                        in_text = True
                elif line.startswith("</text>"):
                    break
                elif curr_doc is not None:
                    line = line.replace("<p>", "").replace("</p>", "")
                    line = re.sub(r"\s+", " ", line)
                    line = line.translate(cleanup_table)

                    for term in line.split():
                        word_count += 1
                        term = stem(term.lower())
                        if len(term) > 2 and term not in stop_words:
                            curr_doc.add_term(term)
        if curr_doc is not None:
            curr_doc.set_doc_len(word_count)
            coll.add_doc(curr_doc)

    return coll

def parse_query(query_file, stop_words):

    """
    Parse all queries in a file into a query collection.

    A line starting with "<num>" opens a new query; its ID is the text
    after the first colon on that line (or, if the line has no colon,
    the whole text after the tag).  A line starting with "<title>"
    supplies the terms of the most recently opened query: digits are
    deleted, punctuation is replaced by spaces, and each token is
    lower-cased and stemmed.  A query term is only indexed if it is at
    least 3 chars long and not a stop word.  A "<title>" line that
    appears before any "<num>" line is ignored.

    Params
    -------
    query_file                  : a file of queries
    stop_words (list of str)    : a list of stop words (any container
                                  supporting `in` works)

    Returns the populated QueryColl.
    """

    # Single table: punctuation -> space, digits -> deleted.
    cleanup_table = str.maketrans(
        string.punctuation, ' ' * len(string.punctuation), string.digits)

    query_coll = QueryColl()
    curr_query = None

    # `with` closes the file; the bare open() left the handle dangling.
    with open(query_file) as handle:
        for line in handle:

            # get query_id and use it to instantiate the query object
            if line.startswith("<num>"):
                num_text = line.split("<num>")[1]
                fields = num_text.split(':')
                # tolerate a missing colon instead of raising IndexError
                id_ = (fields[1] if len(fields) > 1 else num_text).strip()
                curr_query = BowQuery(id_)
                # Register once at creation; terms added later land in the
                # same object, so re-adding it on every line is unnecessary.
                query_coll.add_query(curr_query)

            # get query terms and frequency from the title tag only;
            # the None guard avoids an AttributeError on a stray title
            elif line.startswith("<title>") and curr_query is not None:
                title = line.split("<title>")[1].strip()
                title = re.sub(r"\s+", " ", title)       # standardize whitespaces
                title = title.translate(cleanup_table)   # discard punctuation and digits
                for term in title.split():
                    term = stem(term.lower())  # stemming and lower-casing
                    if len(term) > 2 and term not in stop_words:
                        curr_query.add_term(term)

    return query_coll

def calc_df(coll):
    """
    Count, for every term, the number of documents in the BowColl
    collection that contain it.

    Returns a {term: df, ...} dictionary.
    """
    doc_freq = {}

    for document in coll.get_docs().values():
        # get_term_list() yields each distinct term of the document once
        for word in document.get_term_list():
            doc_freq[word] = doc_freq.get(word, 0) + 1

    return doc_freq

def calc_cum_tf(coll):

    """Sum each term's frequency over all documents in the BowColl collection.

    Returns a {term: cum_freq, ...} dictionary."""

    totals = {}

    for document in coll.get_docs().values():
        term_freqs = document.get_term_freq_dict()
        for word in term_freqs:
            totals[word] = totals.get(word, 0) + term_freqs[word]

    return totals

def total_doc_len(coll):
    """
    Add up get_doc_len() over every document in the collection
    and return the sum (0 for an empty collection).
    """
    return sum(document.get_doc_len() for document in coll.get_docs().values())

def avg_doc_len(coll):
    """
    calculate the average document length of all documents in the collection.

    Returns total_doc_len(coll) divided by the number of documents, or 0
    for an empty collection (instead of raising ZeroDivisionError).
    """
    num_docs = coll.get_num_docs()
    if num_docs == 0:
        # nothing to average; 0 matches BowColl's initial avg_doc_len
        return 0
    return total_doc_len(coll) / num_docs