# french-language-tools/lexicon.py at master · Kartmaan/french-language-tools · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
import time
import logging
import ast
from typing import Optional, Union

import pandas as pd
from openpyxl import load_workbook, Workbook

# ===================================================================
#                          LOGGING INIT
# ===================================================================
# Root logger: only messages of level WARNING and above are emitted.
logger = logging.getLogger()
logger.setLevel(logging.WARNING)

# ===================================================================
#                         DATAFRAME CREATION
# ===================================================================
# Load the dictionary CSV, order it by the "Mot" column, discard rows
# with missing values and renumber the index from zero.
print("Main dataframe creation...")
dict_df = (
    pd.read_csv("files/dico.csv")
    .sort_values("Mot")
    .dropna()
    .reset_index(drop=True)
)

# ===================================================================
#                         EXCEL FILE INIT
# ===================================================================
# The lexicon is stored in the active sheet of the workbook.
excel_file = load_workbook("files/lexi.xlsx")
sheet = excel_file.active

# Header row: column names
sheet["A1"], sheet["B1"], sheet["C1"] = "Mot", "Definitions", "Timestamp"

# ===================================================================
#                            FUNCTIONS
# ===================================================================
# Upper bound on the row index scanned in the lexicon, so that a scan can
# never run for too long or loop indefinitely.
LIMIT_ITER = 10000

def first_empty(workbook: Workbook, column: str = 'A'):
    """ Find the row of the first empty cell in a column, i.e. the row on which to write next.

    Rows are scanned downwards starting at row 2. A cell counts as empty as
    soon as its value is not a string.

    Args:
        workbook (Workbook): Sheet-like object (openpyxl) holding the lexicon; it is indexed with
        cell references such as "A2".
        column (str): Letter of the column to scan.

    Returns:
        int: Row index of the first cell whose value is not a string.
        None: Every scanned row (2 up to LIMIT_ITER - 1) holds a string; a critical message is logged.
    """

    for row in range(2, LIMIT_ITER):
        if not isinstance(workbook[f'{column}{row}'].value, str):
            return row

    # Watchdog: the whole scanned range is filled with strings
    logging.critical("Too many iterations")
    return None

def add_word(dataframe: pd.DataFrame, workbook: Workbook, word: str) -> None:
    """ Addition of a word in the lexicon which is present in the dictionary.

    The word is capitalized, looked up in the "Mot" column of the dictionary
    and, if it is not already in column A of the lexicon, written on the
    first empty row together with its numbered definitions (column B) and the
    current Unix timestamp (column C). The module-level workbook
    `excel_file` is saved only when a row was actually written.

    Args:
        dataframe (pandas.Dataframe): Dictionary dataframe with the columns "Mot" and "Définitions".
        workbook (Workbook): Sheet-like object (openpyxl) holding the lexicon.
        word (str): Word present in the dictionary to add in the lexicon.

    Returns:
        None: In every case (word added, not in dictionary, already in lexicon or no usable empty cell).
    """

    word = word.capitalize()

    matches = dataframe.loc[dataframe['Mot'] == word]

    # Word not found
    if matches.empty:
        logging.warning(f"'{word}' not in dictionary")
        return None

    # Word found
    logging.info(f"'{word}' found in dictionary at idx {matches.index[0]}")
    # The cell is expected to hold the text of a Python list literal
    definitions = ast.literal_eval(matches['Définitions'].values[0])  # -> <list>
    if isinstance(definitions, str):
        # A lone definition must not be enumerated character by character
        definitions = [definitions]

    # The first empty row is computed once: the sheet does not change during the scan
    target_row = first_empty(workbook)
    if target_row is None:
        logging.critical(f"'{word}' not added : no empty cell found in lexicon")
        return None

    # Word already present in the lexicon
    for row in range(2, target_row):
        if workbook[f"A{row}"].value == word:
            logging.warning(f"'{word}' not added : already in lexicon at cell A{row}")
            return None

    # first_empty() stops on any non-string cell; a numeric value is real data
    # and must not be overwritten
    if isinstance(workbook[f"A{target_row}"].value, (int, float)):
        logging.critical(f"'{word}' not added : cell A{target_row} holds a non-text value")
        return None

    # All the definitions are numbered and merged into a single string in order
    # to be inserted into the sheet cell; the space keeps consecutive
    # definitions from running together ("1) ... 2) ...")
    numbered = [f"{number}) {text}" for number, text in enumerate(definitions, start=1)]

    workbook[f"A{target_row}"] = word  # Add word
    workbook[f"B{target_row}"] = " ".join(numbered)  # Add definitions
    workbook[f"C{target_row}"] = str(int(time.time()))  # Add timestamp
    logging.info(f"'{word}' added in lexicon at idx A{target_row}")

    excel_file.save("files/lexi.xlsx")  # Save file after modification
    logging.debug("xlsx file saved after adding a word")
    print(f"The word '{word}' has been added to the lexicon.")

def search(workbook: Workbook, word: str, log: bool = True) -> Optional[tuple]:
    """ Search a word in the lexicon.

    The word is capitalized and compared with the cells of column A, from
    row 2 up to (but excluding) the first empty row.

    Args:
        workbook (Workbook): Sheet-like object (openpyxl) holding the lexicon.
        word (str): Word to search.
        log (bool): A word that can't be found is logged as a warning by default. Pass False when the
        function is only used to check whether a word is missing from the lexicon.

    Returns:
        tuple: Word found. (index, word, definitions, timestamp).
        None: Word not found.
    """

    word = word.capitalize()

    # The end of the lexicon does not move during the scan: compute it once
    # instead of once per visited row
    end_row = first_empty(workbook)
    if end_row is None:
        # Watchdog hit in first_empty(): every row below LIMIT_ITER is filled
        end_row = LIMIT_ITER

    for idx in range(2, end_row):
        # Word found
        if workbook[f'A{idx}'].value == word:
            logging.info(f"'{word}' found in lexicon at idx {idx}")
            return idx, word, workbook[f'B{idx}'].value, workbook[f'C{idx}'].value

    # End of iteration : word not found
    if log:
        logging.warning(f"'{word}' not found in lexicon")
    return None

def delete(workbook: Workbook, word: str) -> Optional[None]:
    """ Remove a word, with its whole row, from the lexicon.

    The row is located with search() and removed with openpyxl's
    delete_rows(), which shifts the following rows up so that no gap is left.
    The module-level workbook `excel_file` is then saved.

    Args:
        workbook (Workbook): Sheet-like object (openpyxl) holding the lexicon.
        word (str): Word to delete.

    Returns:
        None: In every case; a warning is logged when the word is not in the lexicon.
    """

    word = word.capitalize()

    # search() yields None when the word is absent; its own warning is muted
    match = search(workbook, word, log=False)

    # Nothing to delete
    if match is None:
        logging.warning(f"Word '{word}' not deleted : not in lexicon")
        return None

    # First item of the tuple is the row index of the word
    row = match[0]
    workbook.delete_rows(row)
    logging.info(f"'{word}' deleted from the lexicon")

    excel_file.save("files/lexi.xlsx")
    logging.debug("xlsx file saved after deletion")
    print(f"The word '{word}' has been deleted from the lexicon.")

def insert(workbook: Workbook, word: str, definition: Union[str, list]) -> Optional[None]:
    """ Manual addition of a word and its definition to the lexicon.

    The word is capitalized and, unless it is already present, written on the
    first empty row with its numbered definition(s) (column B) and the
    current Unix timestamp (column C). The module-level workbook
    `excel_file` is saved only when a row was actually written.

    Args:
        workbook (Workbook): Sheet-like object (openpyxl) holding the lexicon.
        word (str): Word to insert.
        definition (str or list): Word definition. If there are several definitions, they can be placed in a
        list (a tuple is accepted as well).

    Returns:
        None: In every case (word inserted, already in lexicon, wrong or empty definition, no empty cell).
    """

    word = word.capitalize()

    existing = search(workbook, word, log=False)  # returns None if no word was found

    # Word already present
    if existing is not None:
        logging.warning(f"Word '{word}' already in lexicon at idx {existing[0]}")
        return None

    # 'definition' is a string
    if isinstance(definition, str):
        definition_txt = "1) " + definition

    # 'definition' is a list or a tuple with multiple definitions.
    # Numbering of definitions and merging in a string
    elif isinstance(definition, (list, tuple)):
        if not definition:
            # An empty collection would otherwise write nothing yet report success
            logging.critical(f"Word '{word}' not inserted : no definition given")
            return None
        definition_txt = "".join(f"{number}) {text} " for number, text in enumerate(definition, start=1))

    # Wrong type for 'definition'
    else:
        logging.critical(f"Word '{word}' not inserted : definition must be str or strs in list/tuple")
        return None

    row = first_empty(workbook)  # Where to write
    if row is None:
        logging.critical(f"Word '{word}' not inserted : no empty cell found in lexicon")
        return None

    # The row is written once, after the definition text is complete
    workbook[f'A{row}'] = word
    workbook[f'B{row}'] = definition_txt
    workbook[f'C{row}'] = str(int(time.time()))

    excel_file.save("files/lexi.xlsx")
    logging.debug("xlsx file saved after insertion")
    print(f"The word '{word}' has been added to the lexicon.")

def add_random_words(dataframe: pd.DataFrame, workbook: Workbook, sample_length: int, seed=None):
    """Adds a given number of random words to the lexicon. Mainly for test purposes.

    Rows are drawn from `dataframe` with DataFrame.sample() and each sampled
    word is handed to add_word(), using the same dataframe for the lookup.

    Args:
          dataframe (pandas.Dataframe): Dictionary dataframe.
          workbook (Workbook): Sheet-like object (openpyxl) holding the lexicon.
          sample_length (int): Number of words to sample from the dictionary.
          seed (int): Random seed; None (the default) gives a non-reproducible draw.
    """

    # random_state=None is pandas' own default, so no branch on the seed is needed
    sample = dataframe.sample(sample_length, random_state=seed)

    for word in sample["Mot"]:
        # Look the word up in the dataframe it was sampled from
        add_word(dataframe, workbook, word)

    # NOTE: the reported count is the number of sampled words; add_word() gives
    # no feedback on words it skipped.
    logging.debug(f"{sample_length} words have been added in the lexicon")
    print(f"{sample_length} words were added to the lexicon.")

    excel_file.save("files/lexi.xlsx")
    logging.debug("xlsx file saved after adding random sample")

if __name__ == "__main__":
    # Demo run: add 10 randomly chosen dictionary words to the lexicon (fixed seed)
    add_random_words(dict_df, sheet, 10, seed=42)

    # Add words by name: the first is looked up in the dictionary, the second
    # is a word missing from the dictionary (logging warning)
    for dictionary_word in ("manga", "rompicher"):
        add_word(dict_df, sheet, dictionary_word)

    # Manual addition of words, with their definition, to the lexicon
    manual_entries = (
        ("Valorant", "Jeu de tir à la première personne."),
        ("Luffy", "Personnage principal du manga One Piece."),
    )
    for entry_word, entry_definition in manual_entries:
        insert(sheet, entry_word, entry_definition)

    # Find a word in the lexicon and show the result
    print(search(sheet, "Luffy"))

    # Deleting a word from the lexicon
    delete(sheet, "Valorant")