diff --git a/install/helioviewer/hvpull/browser/httpbrowser.py b/install/helioviewer/hvpull/browser/httpbrowser.py index d235f3476..88b3df308 100644 --- a/install/helioviewer/hvpull/browser/httpbrowser.py +++ b/install/helioviewer/hvpull/browser/httpbrowser.py @@ -22,9 +22,9 @@ def read(self, uri): usock = urllib.request.urlopen(uri) self.feed(usock.read().decode(usock.headers.get_content_charset())) usock.close() - + return self.urls - + def reset(self): """Reset state of URLLister""" HTMLParser.reset(self) @@ -58,7 +58,7 @@ def read(self, uri): print (e) return self.urls - + def reset(self): """Reset state of URLLister""" SGMLParser.reset(self) @@ -73,7 +73,7 @@ class HTTPDataBrowser(BaseDataBrowser): def __init__(self, server): BaseDataBrowser.__init__(self, server) socket.setdefaulttimeout(60) - + def get_directories(self, start_date, end_date): """Generates a list of remote directories which may be queried for files corresponding to the requested range. Note that these @@ -81,17 +81,21 @@ def get_directories(self, start_date, end_date): # filter(lambda url: url.endswith("/"), self._query(location)) return self.server.compute_directories(start_date, end_date) - def get_files(self, location, extension): + def get_files(self, location, extension, filter_func: callable | None = None): """Get all the files that end with specified extension at the uri""" files = None num_retries = 0 - + # Get a list of the files at the remote location, if it exists # To avoid spending too much time, we will timeout after a short time # and retry up to 10 times. while files is None and num_retries <= 10: try: + # Only grab files with the matching file extension files = filter(lambda url: url.endswith("." + extension), self._query(location)) + # If there is a user-defined filter function, use that to only get those specific files. + if filter_func is not None: + files = filter(filter_func, files) except IOError as e: if isinstance(e.strerror, socket.error): # if server is unreachable, raise an exception @@ -105,10 +109,10 @@ def get_files(self, location, extension): files = [] return files - + def _query(self, location): """Get a list of files and folders at the specified remote location""" - # query the remote location for the list of files and subdirectories + # query the remote location for the list of files and subdirectories if (sys.version_info >= (3, 0)): url_lister = URLLister() @@ -121,4 +125,4 @@ def _query(self, location): urls = filter(lambda url: url[0] != "/" and url[0] != "?", result) return [os.path.join(location, url) for url in urls] - + diff --git a/install/helioviewer/hvpull/net/daemon.py b/install/helioviewer/hvpull/net/daemon.py index f7504bd73..336f71f0d 100644 --- a/install/helioviewer/hvpull/net/daemon.py +++ b/install/helioviewer/hvpull/net/daemon.py @@ -174,7 +174,7 @@ def start(self, starttime=None, endtime=None, backfill=None): # get a list of files available # self.oldest_timestamp gets set by query() during the first run # before the main loop. - self.query(starttime, now) + self.query(self.oldest_timestamp, now) self.sleep() @@ -201,6 +201,8 @@ def query(self, starttime, endtime): if any new files have appeared since the first execution. This continues until no new files are found (for xxx minutes?) """ + if (starttime > endtime): + raise ValueError(f"Start Time {starttime} is ahead of End Time {endtime}. No files would be downloaded.") urls = [] fmt = '%Y-%m-%d %H:%M:%S' @@ -241,6 +243,7 @@ def query(self, starttime, endtime): try: # Filter by time range filtered = self._filter_files_by_time(url_list, starttime, endtime) + # Filter to only download new files that have not already been downloaded previously. filtered = list(filter(self._filter_new, filtered)) except mysqld.OperationalError: # MySQL has gone away -- try again in 5s @@ -322,11 +325,13 @@ def query(self, starttime, endtime): if self.servers[0].name in ['LMSAL2']: new_urls.append(extra_filtered) if len(extra_filtered) > 0: - self.oldest_timestamp = self._get_oldest_image(extra_filtered) + # Using max(starttime, ...) so oldest_timestamp never goes earlier than the initial requested starttime + self.oldest_timestamp = max(starttime, self._get_oldest_image(extra_filtered)) else: new_urls.append(filtered) if len(filtered) > 0: - self.oldest_timestamp = self._get_oldest_image(filtered) + # Using max(starttime, ...) so oldest_timestamp never goes earlier than the initial requested starttime + self.oldest_timestamp = max(starttime, self._get_oldest_image(filtered)) # check disk space if not self.sent_diskspace_warning: @@ -421,7 +426,7 @@ def query_server(self, browser, starttime, endtime): return [] try: - matches = browser.get_files(directory, "jp2") + matches = browser.get_files(directory, "jp2", browser.server.filter) files.extend(matches) except NetworkError: diff --git a/install/helioviewer/hvpull/servers/__init__.py b/install/helioviewer/hvpull/servers/__init__.py index 5c871dd50..8a099e2a7 100644 --- a/install/helioviewer/hvpull/servers/__init__.py +++ b/install/helioviewer/hvpull/servers/__init__.py @@ -68,10 +68,13 @@ def get_dates(self, starttime, endtime): return dates - def get_file_regex(self): - """Returns a regex which described the expected format of filenames on - the server""" - return self.filename_regex + def filter(self, file: str) -> bool: + """ + Returns True if the file should be downloaded, otherwise False. + This may be overridden by specific Data Servers to only download + specific files from the upstream directory + """ + return True def get_measurements(self, nicknames, dates): """Get a list of all the URIs down to the measurement""" @@ -85,7 +88,7 @@ def get_datetime_from_file(self, filename): return get_datetime_from_file(filename) -class DataServerPauseDelayDefinesDefaultStartTime: +class DataServerPauseDelayDefinesDefaultStartTime(DataServer): """Class for interacting with data servers. In this class the pause defines the default start time. If real time is UTC, then the default start time is UTC - pause minutes.""" @@ -126,11 +129,6 @@ def get_dates(self, starttime, endtime): return dates - def get_file_regex(self): - """Returns a regex which described the expected format of filenames on - the server""" - return self.filename_regex - def get_measurements(self, nicknames, dates): """Get a list of all the URIs down to the measurement""" return None