11#!/usr/bin/env python3
22# -*- coding:utf-8 -*-
33
4+ from pathlib import Path
5+
6+ import pickle
47import re
58
69class TaiwanAddressAnalizer :
@@ -10,20 +13,24 @@ def __init__(self, locale=None):
1013 else :
1114 locale = "TW"
1215 self .addTWPat = re .compile ("(?<=<KNOWLEDGE_addTW>)[^<]*?(?=</KNOWLEDGE_addTW>)" )
13- self .TWaddPatDICT = {"countyPat" : "[^\s][^市]縣" ,
14- "cityPat" : "[^是在於及、,\s]{1,2}市" ,
15- "districtPat" : "那瑪夏區|[^市及、,\s]+?.社?區|[東西南北中]區" ,
16- "townshipPat" : "(?:阿里山|三地門|太麻里)鄉|..鄉|[^縣].里(?!區)" ,
17- "townPat" : "[^\s][^\s]鎮" ,
18- "villagePat" : "(?:(?<=[縣市區鄉鎮路段])(?:[^\s縣市區鄉鎮路段]+)?新?村|[^\s][^\s]新?村)(?!路)" ,
19- "neighborhoodPat" : "(?:\s?[零一二三四五六七八九十廿卅0-9\d]+?\s?鄰)" ,
20- "roadPat" : "市府路|市[政場](?:[北南中]|[1-7一二三四五六七]){0,2}路|市港[^\s]路|美村路|[新環村盛果]市[^\s]?[路街]|市民大道|市宅街|[埔菜美元西]市[路街]|(?<=[縣市區鄉鎮里村鄰])[^市區鄉鎮村路及鄰、,]{1,4}(?:[路街](?!里)|大道)(?:[零一二三四五六七八九十廿卅百0-9\d]+?街)?" ,
21- "sectionPat" : "\s?[零一二三四五六七八九十廿卅百0-9\d]*?\s?段" ,
22- "alleyPat" : "(?:國中|市場|新市.|七里溪|(?:[^縣市區鄉鎮里村路街段]{1,2}|鐵路)[零一二三四五六七八九十廿卅百0-9\d]*?|\s?[零一二三四五六七八九十廿卅百0-9\d]*?)\s?巷(?:[零一二三四五六七八九十廿卅百0-9\d]*?\s?弄)?" ,
23- "numberPat" : "(?:\s?[零一二三四五六七八九十廿卅百0-9\d]*?\s?[之\-]\s?)?\s?[零一二三四五六七八九十廿卅百0-9\d]*?\s?號(?:[之\-]\s?[零一二三四五六七八九十廿卅百0-9\d]+?)?" ,
24- "floorPat" : "\s?[零一二三四五六七八九十廿卅百0-9\d]*?\s?[fF樓]" ,
25- "roomPat" : "\s?(?:[a-zA-Z零一二三四五六七八九十廿卅百\d0-9]+?)\s?(室?$)" }
16+ self .TWaddPatDICT = {"countyPat" : r"[^\s][^市]縣" ,
17+ "cityPat" : r"[^是在於及、,\s]{1,2}市" ,
18+ "districtPat" : r"那瑪夏區|[^市及、,\s]+?.社?區|[東西南北中]區" ,
19+ "townshipPat" : r"(?:阿里山|三地門|太麻里)鄉|..鄉|[^縣].里(?!區)" ,
20+ "townPat" : r"[^\s][^\s]鎮" ,
21+ "villagePat" : r"(?:(?<=[縣市區鄉鎮鄰路街段])(?:[^\s縣市區鄉鎮鄰路街段]+)?新?村|[^\s][^\s]新?村)(?!路)" ,
22+ "neighborhoodPat" : r"(?:\s?[零一二三四五六七八九十廿卅0-9\d]+?\s?鄰)" ,
23+ "roadPat" : r"市[政場](?:[北南中]|[1-7一二三四五六七]){0,2}路|" \
24+ r"市府路|市港[^\s]路|[新環村盛果]市[^\s]?[路街]|美村路|市民大道|市宅街|[埔菜美元西]市[路街]|" \
25+ r"(?<=[縣市區鄉鎮里村鄰])(?:{{STRANGE_ROAD}}[^市區鄉鎮村路及鄰、,]{1,4}(?:[路街](?!里)|大道)(?:[零一二三四五六七八九十廿卅百0-9\d]+?街)?)" ,
26+ "sectionPat" : r"\s?[零一二三四五六七八九十廿卅百0-9\d]*?\s?段" ,
27+ "alleyPat" : r"(?:國中|市場|新市.|七里溪|(?:[^縣市區鄉鎮里村路街段]{1,2}|鐵路)[零一二三四五六七八九十廿卅百0-9\d]*?|\s?[零一二三四五六七八九十廿卅百0-9\d]*?)\s?巷(?:[零一二三四五六七八九十廿卅百0-9\d]*?\s?弄)?" ,
28+ "numberPat" : r"(?:\s?[零一二三四五六七八九十廿卅百0-9\d]*?\s?[之\-]\s?)?\s?[零一二三四五六七八九十廿卅百0-9\d]*?\s?號(?:[之\-]\s?[零一二三四五六七八九十廿卅百0-9\d]+?)?" ,
29+ "floorPat" : r"\s?[零一二三四五六七八九十廿卅百0-9\d]*?\s?[fF樓]" ,
30+ "roomPat" : r"\s?(?:[a-zA-Z零一二三四五六七八九十廿卅百\d0-9]+?)\s?(?:室?$)" }
2631 self .stripPat = re .compile ("(?<=>).*?(?=<)" )
32+ basePath = Path (__file__ ).resolve ().parent
33+ self .stragerRoadPatDICT = pickle .loads (open (f"{ basePath } /data/strangerRoad.pj" , "rb" ).read ())
2734
2835 def _addIndexConverter (self , ArticutResultDICT , addIndexLIST ):
2936 '''
@@ -193,11 +200,19 @@ def getAddressRoad(self, ArticutResultDICT, indexWithPOS=True):
193200 if type (ArticutResultDICT ) is list :
194201 ArticutResultLIST = self .mergeBulkResult (ArticutResultDICT )
195202 for i , x in enumerate (ArticutResultLIST ):
196- resultAppend (self ._getAddLIST (x , self .TWaddPatDICT ["roadPat" ]))
203+ sRoadReSTR = ""
204+ wordSET = set ("" .join (x ["result_segmentation" ])).intersection (self .stragerRoadPatDICT )
205+ if wordSET :
206+ sRoadReSTR = fr"(?:{ '|' .join ([self .stragerRoadPatDICT [w ] for w in wordSET if w in self .stragerRoadPatDICT ])} )(?=\s?\d+(?:-\d+)?\s?號)|"
207+ resultAppend (self ._getAddLIST (x , self .TWaddPatDICT ["roadPat" ].replace ("{{STRANGE_ROAD}}" , sRoadReSTR )))
197208 if not indexWithPOS and resultLIST :
198209 resultLIST [i ] = self ._addIndexConverter (x , resultLIST )
199210 else :
200- resultLIST = self ._getAddLIST (ArticutResultDICT , self .TWaddPatDICT ["roadPat" ])
211+ sRoadReSTR = ""
212+ wordSET = set ("" .join (ArticutResultDICT ["result_segmentation" ])).intersection (self .stragerRoadPatDICT )
213+ if wordSET :
214+ sRoadReSTR = fr"(?:{ '|' .join ([self .stragerRoadPatDICT [w ] for w in wordSET if w in self .stragerRoadPatDICT ])} )(?=\s?\d+(?:-\d+)?\s?號)|"
215+ resultLIST = self ._getAddLIST (ArticutResultDICT , self .TWaddPatDICT ["roadPat" ].replace ("{{STRANGE_ROAD}}" , sRoadReSTR ))
201216 if not indexWithPOS and resultLIST :
202217 resultLIST = self ._addIndexConverter (ArticutResultDICT , resultLIST )
203218 return resultLIST
0 commit comments