Skip to content

Commit 5361807

Browse files
committed
調整 localRE pat, 更新 getAddressRoad()
1 parent 0a647a4 commit 5361807

4 files changed

Lines changed: 32 additions & 16 deletions

File tree

29.6 KB
Binary file not shown.

ArticutAPI/Toolkit/localRE.py

Lines changed: 30 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,9 @@
11
#!/usr/bin/env python3
22
# -*- coding:utf-8 -*-
33

4+
from pathlib import Path
5+
6+
import pickle
47
import re
58

69
class TaiwanAddressAnalizer:
@@ -10,20 +13,24 @@ def __init__(self, locale=None):
1013
else:
1114
locale="TW"
1215
self.addTWPat = re.compile("(?<=<KNOWLEDGE_addTW>)[^<]*?(?=</KNOWLEDGE_addTW>)")
13-
self.TWaddPatDICT = {"countyPat" : "[^\s][^市]縣",
14-
"cityPat" : "[^是在於及、,\s]{1,2}市",
15-
"districtPat" : "那瑪夏區|[^市及、,\s]+?.社?區|[東西南北中]區",
16-
"townshipPat" : "(?:阿里山|三地門|太麻里)鄉|..鄉|[^縣].里(?!區)",
17-
"townPat" : "[^\s][^\s]鎮",
18-
"villagePat" : "(?:(?<=[縣市區鄉鎮路段])(?:[^\s縣市區鄉鎮路段]+)?新?村|[^\s][^\s]新?村)(?!路)",
19-
"neighborhoodPat": "(?:\s?[零一二三四五六七八九十廿卅0-9\d]+?\s?鄰)",
20-
"roadPat" : "市府路|市[政場](?:[北南中]|[1-7一二三四五六七]){0,2}路|市港[^\s]路|美村路|[新環村盛果]市[^\s]?[路街]|市民大道|市宅街|[埔菜美元西]市[路街]|(?<=[縣市區鄉鎮里村鄰])[^市區鄉鎮村路及鄰、,]{1,4}(?:[路街](?!里)|大道)(?:[零一二三四五六七八九十廿卅百0-9\d]+?街)?",
21-
"sectionPat" : "\s?[零一二三四五六七八九十廿卅百0-9\d]*?\s?段",
22-
"alleyPat" : "(?:國中|市場|新市.|七里溪|(?:[^縣市區鄉鎮里村路街段]{1,2}|鐵路)[零一二三四五六七八九十廿卅百0-9\d]*?|\s?[零一二三四五六七八九十廿卅百0-9\d]*?)\s?巷(?:[零一二三四五六七八九十廿卅百0-9\d]*?\s?弄)?",
23-
"numberPat" : "(?:\s?[零一二三四五六七八九十廿卅百0-9\d]*?\s?[之\-]\s?)?\s?[零一二三四五六七八九十廿卅百0-9\d]*?\s?號(?:[之\-]\s?[零一二三四五六七八九十廿卅百0-9\d]+?)?",
24-
"floorPat" : "\s?[零一二三四五六七八九十廿卅百0-9\d]*?\s?[fF樓]",
25-
"roomPat" : "\s?(?:[a-zA-Z零一二三四五六七八九十廿卅百\d0-9]+?)\s?(室?$)"}
16+
self.TWaddPatDICT = {"countyPat" : r"[^\s][^市]縣",
17+
"cityPat" : r"[^是在於及、,\s]{1,2}市",
18+
"districtPat" : r"那瑪夏區|[^市及、,\s]+?.社?區|[東西南北中]區",
19+
"townshipPat" : r"(?:阿里山|三地門|太麻里)鄉|..鄉|[^縣].里(?!區)",
20+
"townPat" : r"[^\s][^\s]鎮",
21+
"villagePat" : r"(?:(?<=[縣市區鄉鎮鄰路街段])(?:[^\s縣市區鄉鎮鄰路街段]+)?新?村|[^\s][^\s]新?村)(?!路)",
22+
"neighborhoodPat": r"(?:\s?[零一二三四五六七八九十廿卅0-9\d]+?\s?鄰)",
23+
"roadPat" : r"市[政場](?:[北南中]|[1-7一二三四五六七]){0,2}路|" \
24+
r"市府路|市港[^\s]路|[新環村盛果]市[^\s]?[路街]|美村路|市民大道|市宅街|[埔菜美元西]市[路街]|" \
25+
r"(?<=[縣市區鄉鎮里村鄰])(?:{{STRANGE_ROAD}}[^市區鄉鎮村路及鄰、,]{1,4}(?:[路街](?!里)|大道)(?:[零一二三四五六七八九十廿卅百0-9\d]+?街)?)",
26+
"sectionPat" : r"\s?[零一二三四五六七八九十廿卅百0-9\d]*?\s?段",
27+
"alleyPat" : r"(?:國中|市場|新市.|七里溪|(?:[^縣市區鄉鎮里村路街段]{1,2}|鐵路)[零一二三四五六七八九十廿卅百0-9\d]*?|\s?[零一二三四五六七八九十廿卅百0-9\d]*?)\s?巷(?:[零一二三四五六七八九十廿卅百0-9\d]*?\s?弄)?",
28+
"numberPat" : r"(?:\s?[零一二三四五六七八九十廿卅百0-9\d]*?\s?[之\-]\s?)?\s?[零一二三四五六七八九十廿卅百0-9\d]*?\s?號(?:[之\-]\s?[零一二三四五六七八九十廿卅百0-9\d]+?)?",
29+
"floorPat" : r"\s?[零一二三四五六七八九十廿卅百0-9\d]*?\s?[fF樓]",
30+
"roomPat" : r"\s?(?:[a-zA-Z零一二三四五六七八九十廿卅百\d0-9]+?)\s?(?:室?$)"}
2631
self.stripPat = re.compile("(?<=>).*?(?=<)")
32+
basePath = Path(__file__).resolve().parent
33+
self.stragerRoadPatDICT = pickle.loads(open(f"{basePath}/data/strangerRoad.pj", "rb").read())
2734

2835
def _addIndexConverter(self, ArticutResultDICT, addIndexLIST):
2936
'''
@@ -193,11 +200,19 @@ def getAddressRoad(self, ArticutResultDICT, indexWithPOS=True):
193200
if type(ArticutResultDICT) is list:
194201
ArticutResultLIST = self.mergeBulkResult(ArticutResultDICT)
195202
for i, x in enumerate(ArticutResultLIST):
196-
resultAppend(self._getAddLIST(x, self.TWaddPatDICT["roadPat"]))
203+
sRoadReSTR = ""
204+
wordSET = set("".join(x["result_segmentation"])).intersection(self.stragerRoadPatDICT)
205+
if wordSET:
206+
sRoadReSTR = fr"(?:{'|'.join([self.stragerRoadPatDICT[w] for w in wordSET if w in self.stragerRoadPatDICT])})(?=\s?\d+(?:-\d+)?\s?號)|"
207+
resultAppend(self._getAddLIST(x, self.TWaddPatDICT["roadPat"].replace("{{STRANGE_ROAD}}", sRoadReSTR)))
197208
if not indexWithPOS and resultLIST:
198209
resultLIST[i] = self._addIndexConverter(x, resultLIST)
199210
else:
200-
resultLIST = self._getAddLIST(ArticutResultDICT, self.TWaddPatDICT["roadPat"])
211+
sRoadReSTR = ""
212+
wordSET = set("".join(ArticutResultDICT["result_segmentation"])).intersection(self.stragerRoadPatDICT)
213+
if wordSET:
214+
sRoadReSTR = fr"(?:{'|'.join([self.stragerRoadPatDICT[w] for w in wordSET if w in self.stragerRoadPatDICT])})(?=\s?\d+(?:-\d+)?\s?號)|"
215+
resultLIST = self._getAddLIST(ArticutResultDICT, self.TWaddPatDICT["roadPat"].replace("{{STRANGE_ROAD}}", sRoadReSTR))
201216
if not indexWithPOS and resultLIST:
202217
resultLIST = self._addIndexConverter(ArticutResultDICT, resultLIST)
203218
return resultLIST

MANIFEST.in

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1 +1,2 @@
11
include ArticutAPI/Toolkit/data/*json
2+
include ArticutAPI/Toolkit/data/strangerRoad.pj

setup.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55

66
setuptools.setup(
77
name="ArticutAPI",
8-
version="1.3.7",
8+
version="1.3.8",
99
author="Droidtown Linguistic Tech. Co. Ltd.",
1010
author_email="info@droidtown.co",
1111
description="Articut NLP system provides not only finest results on Chinese word segmentaion (CWS), Part-of-Speech tagging (POS) and Named Entity Recogintion tagging (NER), but also the fastest online API service in the NLP industry.",

0 commit comments

Comments
 (0)