From 12c23e2b5fc23e7bf89d5937c229be13e6dc5823 Mon Sep 17 00:00:00 2001 From: andela_ookoro Date: Thu, 23 Nov 2017 10:02:13 +0100 Subject: [PATCH] Slack notification after 30min of scraping --- healthtools/config.py | 25 +++---------------------- requirements.txt | 4 ++-- runtime.txt | 2 +- scraper.py | 35 +++++++++++++++++++++++++++++++++-- 4 files changed, 39 insertions(+), 27 deletions(-) diff --git a/healthtools/config.py b/healthtools/config.py index e0db0ed..7125f88 100644 --- a/healthtools/config.py +++ b/healthtools/config.py @@ -37,7 +37,7 @@ # sites to be scraped SITES = { - "DOCTORS": "http://medicalboard.co.ke/online-services/retention/?currpage={}", + "DOCTORS": "https://medicalboard.co.ke/online-services/retention/?currpage={}", "FOREIGN_DOCTORS": "http://medicalboard.co.ke/online-services/foreign-doctors-license-register/?currpage={}", "CLINICAL_OFFICERS": "http://clinicalofficerscouncil.org/online-services/retention/?currpage={}", "TOKEN_URL": "http://api.kmhfl.health.go.ke/o/token/", @@ -49,7 +49,6 @@ NHIF_SERVICES = ["inpatient", "outpatient", "outpatient-cs"] # config logging - LOGGING = { "version": 1, "disable_existing_loggers": False, @@ -57,38 +56,20 @@ "simple": { "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s", "datefmt": "%Y-%m-%d %H:%M:%S" - }, - "slack": { - "format": """ - Location: %(module)s: %(funcName)s: %(lineno)d \n Time: %(asctime)s \n Message: %(message)s - """, - "datefmt": "%Y-%m-%d %H:%M:%S" } }, - "handlers": { "console": { "class": "logging.StreamHandler", "level": "DEBUG", - "formatter": "simple", "stream": "ext://sys.stdout" - }, - - "slack_log": { - "level": "WARNING", - "api_key": os.getenv('SLACK_API_TOKEN', None), - "class": "slacker_log_handler.SlackerLogHandler", - "channel": os.getenv('SLACK_LOGGER_CHANNEL', None), - "username": "Scrapper Slack Logger", - "stack_trace": True, - "formatter": "slack", - "fail_silent": True #would not raise an error when api token is invalid } }, "root": { "level": "INFO", - "handlers": ["console","slack_log"] + "handlers": ["console"] } } + diff --git a/requirements.txt b/requirements.txt index dc69fd3..3f463e3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,14 +19,14 @@ nose==1.3.7 packaging==16.8 pyparsing==2.2.0 python-dateutil==2.6.0 -python-memcached==1.58 +python3-memcached== 1.51 requests==2.13.0 requests-aws4auth==0.9 s3transfer==0.1.10 six==1.10.0 +slack-logger==0.2.0 slackclient==1.0.6 slacker==0.9.42 -slacker-log-handler==1.6.1 termcolor==1.1.0 urllib3==1.21.1 websocket-client==0.40.0 diff --git a/runtime.txt b/runtime.txt index 4b38fc9..cfa5aa5 100644 --- a/runtime.txt +++ b/runtime.txt @@ -1 +1 @@ -python-2.7.13 +python-3.6.2 diff --git a/scraper.py b/scraper.py index f004162..de6a68b 100644 --- a/scraper.py +++ b/scraper.py @@ -22,9 +22,25 @@ def setup_logging(default_level=logging.INFO): logging.config.dictConfig(LOGGING) except Exception as ex: logging.basicConfig(level=default_level) - -if __name__ == "__main__": + # add slack log handler + SLACK_URL = os.getenv("MORPH_WEBHOOK_URL", None) + if SLACK_URL: + from slack_logger import SlackHandler, SlackFormatter + log.setLevel(logging.WARNING) + try: + sh = SlackHandler(username='Scraper Logger', url=SLACK_URL) + sh.setLevel(logging.WARNING) + f = SlackFormatter() + sh.setFormatter(f) + log.addHandler(sh) + except Exception as ex: + log.error('Unable to add slack_logger', str(ex)) + +def scrapers(): + ''' + Function to run every scraper + ''' # Initialize the Scrapers doctors_scraper = DoctorsScraper() foreign_doctors_scraper = ForeignDoctorsScraper() @@ -97,3 +113,18 @@ def setup_logging(default_level=logging.INFO): scraper_stats.data_archive_key = "stats/stats-{}.json" scraper_stats.archive_data(json.dumps(scraping_statistics)) + # log warning when scraper ran more than 30 minutes + if(m >= 30): + log.warning('Scraper: {} ran for about {}'.format(scraper_id, time_taken)) + +if __name__ == "__main__": + setup_logging() + import multiprocessing + # Start the scrapers + scraping = multiprocessing.Process(target=scrapers) + scraping.start() + scraping.join(30 * 60) + + # log error if scraping is still running after 30 minutes + if scraping.is_alive(): + log.warning('Scraper: {} is running for more than 30 minutes'.format(scraper_id))