From 90a6188cb3525dcf42fcddc2e3546f0517ef3b33 Mon Sep 17 00:00:00 2001 From: celelstine Date: Fri, 27 Oct 2017 11:13:29 +0100 Subject: [PATCH 1/4] add vscode config to git ignore --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index c3842d1..0628d1c 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,7 @@ .*env venv *.DS_Store + +# visual studio code config +.vscode + From 39d174f7b27a377905e5025b6ac91b97586d537e Mon Sep 17 00:00:00 2001 From: celelstine Date: Fri, 27 Oct 2017 11:14:24 +0100 Subject: [PATCH 2/4] send slack notification after 30 minutes of scraping --- scraper.py | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/scraper.py b/scraper.py index a1adfa5..8b56515 100644 --- a/scraper.py +++ b/scraper.py @@ -9,10 +9,24 @@ logging.basicConfig(level=logging.INFO) +import time +scraper_id = 0 +# error message for +error = { + "ERROR": "scrapers()", + "SOURCE": "Scraper time tracker", + "MESSAGE": "" +} +# create a scrapper to log error message +scraper = Scraper() -if __name__ == "__main__": - +def scrapers(): + ''' + Function to run every scraper + ''' + # record the start time + start_time = time.time() # Initialize the Scrapers doctors_scraper = DoctorsScraper() foreign_doctors_scraper = ForeignDoctorsScraper() @@ -59,3 +73,19 @@ nhif_inpatient_result = nhif_inpatient_scraper.run_scraper() nhif_outpatient_result = nhif_outpatient_scraper.run_scraper() nhif_outpatient_cs_result = nhif_outpatient_cs_scraper.run_scraper() + + +if __name__ == "__main__": + import multiprocessing + # Start the scrapers + scraping = multiprocessing.Process(target=scrapers) + scraping.start() + scraping.join(30*60) + + # log error if scraping is still running after 30 minutes + if scraping.is_alive(): + # create a random Id for this scrap instance + import random + scraper_id = random.randint(1, 100000) + error['MESSAGE'] = 'Scraper: {} is taking more than 30 
minutes'.format(scraper_id) + scraper.print_error(error) From 987c24077d0292dc0f52d1e73f175d552928c0aa Mon Sep 17 00:00:00 2001 From: celelstine Date: Fri, 27 Oct 2017 11:18:26 +0100 Subject: [PATCH 3/4] send total time spent when scraping lasted for more than 30 minutes --- scraper.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scraper.py b/scraper.py index 8b56515..f653690 100644 --- a/scraper.py +++ b/scraper.py @@ -74,6 +74,12 @@ def scrapers(): nhif_outpatient_result = nhif_outpatient_scraper.run_scraper() nhif_outpatient_cs_result = nhif_outpatient_cs_scraper.run_scraper() + # record end time + end_time = time.time() + response_time_in_minutes = (end_time - start_time) / (60) + if(response_time_in_minutes >= 30): + error['MESSAGE'] = 'Scraper: {} took about {} minutes'.format(scraper_id, response_time_in_minutes) + scraper.print_error(error) if __name__ == "__main__": import multiprocessing From 84b0e9f95b01cc6617e6818ee6aca8b345342d17 Mon Sep 17 00:00:00 2001 From: celelstine Date: Fri, 27 Oct 2017 11:30:55 +0100 Subject: [PATCH 4/4] change the index date format to match the date format changed in the web page --- healthtools/scrapers/clinical_officers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/healthtools/scrapers/clinical_officers.py b/healthtools/scrapers/clinical_officers.py index 8c8cc4e..84b5e8d 100644 --- a/healthtools/scrapers/clinical_officers.py +++ b/healthtools/scrapers/clinical_officers.py @@ -26,9 +26,9 @@ def elasticsearch_format(self, entry): :return: dictionaries of the entry's metadata and the formatted entry """ try: - date_obj = datetime.strptime(entry["reg_date"], "%Y-%m-%d") + date_obj = datetime.strptime(entry["reg_date"], "%Y-%m-%d %H:%M") except: - date_obj = datetime.strptime(entry["reg_date"], "%d-%m-%Y") + date_obj = datetime.strptime(entry["reg_date"], "%d-%m-%Y %H:%M") entry["reg_date"] = datetime.strftime( date_obj, "%Y-%m-%dT%H:%M:%S.000Z") # all bulk data need meta data describing the 
data