# Patch summary (three files are touched):
#   .gitignore
#     Adds an ignore entry for the Visual Studio Code `.vscode` directory.
#   healthtools/scrapers/clinical_officers.py
#     elasticsearch_format() now parses entry["reg_date"] with an extra " %H:%M"
#     time component, in both the primary format ("%Y-%m-%d %H:%M") and the
#     fallback format ("%d-%m-%Y %H:%M") used when the first parse raises.
#   scraper.py
#     The scraper run moves out of the `if __name__ == "__main__":` guard into a
#     new function scrapers(), which records its start and end time and calls
#     scraper.print_error(error) when the run lasted 30 minutes or more.
#     The new `__main__` guard starts scrapers() in a multiprocessing.Process,
#     waits on it with join(30*60), and logs an error through the same
#     module-level `error` dict if the process is still alive afterwards.
# NOTE(review): scraper_id is reassigned only in the parent process, and only
#   after the child was started, so the message formatted inside scrapers()
#   presumably always reports id 0 -- confirm.
# NOTE(review): after the timeout message the child process is neither
#   terminated nor joined again -- confirm that letting it keep running is intended.
# NOTE(review): the bare `except:` around strptime falls back to the second
#   format on any exception (including a missing "reg_date" key), not only on a
#   format mismatch.
diff --git a/.gitignore b/.gitignore index c3842d1..0628d1c 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,7 @@ .*env venv *.DS_Store + +# visual studio code config +.vscode + diff --git a/healthtools/scrapers/clinical_officers.py b/healthtools/scrapers/clinical_officers.py index 8c8cc4e..84b5e8d 100644 --- a/healthtools/scrapers/clinical_officers.py +++ b/healthtools/scrapers/clinical_officers.py @@ -26,9 +26,9 @@ def elasticsearch_format(self, entry): :return: dictionaries of the entry's metadata and the formatted entry """ try: - date_obj = datetime.strptime(entry["reg_date"], "%Y-%m-%d") + date_obj = datetime.strptime(entry["reg_date"], "%Y-%m-%d %H:%M") except: - date_obj = datetime.strptime(entry["reg_date"], "%d-%m-%Y") + date_obj = datetime.strptime(entry["reg_date"], "%d-%m-%Y %H:%M") entry["reg_date"] = datetime.strftime( date_obj, "%Y-%m-%dT%H:%M:%S.000Z") # all bulk data need meta data describing the data diff --git a/scraper.py b/scraper.py index a1adfa5..f653690 100644 --- a/scraper.py +++ b/scraper.py @@ -9,10 +9,24 @@ logging.basicConfig(level=logging.INFO) +import time +scraper_id = 0 +# error message for +error = { + "ERROR": "scrapers()", + "SOURCE": "Scraper time tracker", + "MESSAGE": "" +} +# create a scrapper to log error message +scraper = Scraper() -if __name__ == "__main__": - +def scrapers(): + ''' + Function to run every scraper + ''' + # record the start time + start_time = time.time() # Initialize the Scrapers doctors_scraper = DoctorsScraper() foreign_doctors_scraper = ForeignDoctorsScraper() @@ -59,3 +73,25 @@ nhif_inpatient_result = nhif_inpatient_scraper.run_scraper() nhif_outpatient_result = nhif_outpatient_scraper.run_scraper() nhif_outpatient_cs_result = nhif_outpatient_cs_scraper.run_scraper() + + # record end time + end_time = time.time() + response_time_in_minutes = (end_time - start_time) / (60) + if(response_time_in_minutes >= 30): + error['MESSAGE'] = 'Scraper: {} took about {} minutes'.format(scraper_id, 
response_time_in_minutes) + scraper.print_error(error) + +if __name__ == "__main__": + import multiprocessing + # Start the scrapers + scraping = multiprocessing.Process(target=scrapers) + scraping.start() + scraping.join(30*60) + + # log error if scraping is still running after 30 minutes + if scraping.is_alive(): + # create a random Id for this scrap instance + import random + scraper_id = random.randint(1, 100000) + error['MESSAGE'] = 'Scraper: {} is taking more than 30 minutes'.format(scraper_id) + scraper.print_error(error)