Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 13 additions & 9 deletions install/helioviewer/hvpull/browser/httpbrowser.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@ def read(self, uri):
usock = urllib.request.urlopen(uri)
self.feed(usock.read().decode(usock.headers.get_content_charset()))
usock.close()

return self.urls

def reset(self):
"""Reset state of URLLister"""
HTMLParser.reset(self)
Expand Down Expand Up @@ -58,7 +58,7 @@ def read(self, uri):
print (e)

return self.urls

def reset(self):
"""Reset state of URLLister"""
SGMLParser.reset(self)
Expand All @@ -73,25 +73,29 @@ class HTTPDataBrowser(BaseDataBrowser):
def __init__(self, server):
    """Create an HTTP data browser bound to *server*.

    NOTE(review): socket.setdefaulttimeout mutates process-wide state —
    it affects every socket created afterwards, not just this browser's
    requests. Confirm no other component relies on a different default.
    """
    BaseDataBrowser.__init__(self, server)
    # 60 s cap so listings of slow or unreachable hosts fail fast.
    socket.setdefaulttimeout(60)

def get_directories(self, start_date, end_date):
    """Return candidate remote directories for the requested time range.

    The list is computed locally from the server's directory-layout
    rules rather than by querying the remote host, so some of the
    returned directories may not actually exist on the server.
    """
    return self.server.compute_directories(start_date, end_date)

def get_files(self, location, extension):
def get_files(self, location, extension, filter_func: callable | None = None):
"""Get all the files that end with specified extension at the uri"""
files = None
num_retries = 0

# Get a list of the files at the remote location, if it exists
# To avoid spending too much time, we will timeout after a short time
# and retry up to 10 times.
while files is None and num_retries <= 10:
try:
# Only grab files with the matching file extension
files = filter(lambda url: url.endswith("." + extension), self._query(location))
# If there is a user-defined filter function, use that to only get those specific files.
if filter_func is not None:
files = filter(filter_func, files)
except IOError as e:
if isinstance(e.strerror, socket.error):
# if server is unreachable, raise an exception
Expand All @@ -105,10 +109,10 @@ def get_files(self, location, extension):
files = []

return files

def _query(self, location):
"""Get a list of files and folders at the specified remote location"""
# query the remote location for the list of files and subdirectories
# query the remote location for the list of files and subdirectories

if (sys.version_info >= (3, 0)):
url_lister = URLLister()
Expand All @@ -121,4 +125,4 @@ def _query(self, location):
urls = filter(lambda url: url[0] != "/" and url[0] != "?", result)

return [os.path.join(location, url) for url in urls]

13 changes: 9 additions & 4 deletions install/helioviewer/hvpull/net/daemon.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ def start(self, starttime=None, endtime=None, backfill=None):
# get a list of files available
# self.oldest_timestamp gets set by query() during the first run
# before the main loop.
self.query(starttime, now)
self.query(self.oldest_timestamp, now)

self.sleep()

Expand All @@ -201,6 +201,8 @@ def query(self, starttime, endtime):
if any new files have appeared since the first execution. This continues
until no new files are found (for xxx minutes?)
"""
if (starttime > endtime):
raise ValueError(f"Start Time {starttime} is ahead of End Time {endtime}. No files would be downloaded.")
urls = []

fmt = '%Y-%m-%d %H:%M:%S'
Expand Down Expand Up @@ -241,6 +243,7 @@ def query(self, starttime, endtime):
try:
# Filter by time range
filtered = self._filter_files_by_time(url_list, starttime, endtime)
# Filter to only download new files that have not already been downloaded previously.
filtered = list(filter(self._filter_new, filtered))
except mysqld.OperationalError:
# MySQL has gone away -- try again in 5s
Expand Down Expand Up @@ -322,11 +325,13 @@ def query(self, starttime, endtime):
if self.servers[0].name in ['LMSAL2']:
new_urls.append(extra_filtered)
if len(extra_filtered) > 0:
self.oldest_timestamp = self._get_oldest_image(extra_filtered)
# Using max(starttime, ...) so oldest_timestamp never goes earlier than the initial requested starttime
self.oldest_timestamp = max(starttime, self._get_oldest_image(extra_filtered))
else:
new_urls.append(filtered)
if len(filtered) > 0:
self.oldest_timestamp = self._get_oldest_image(filtered)
# Using max(starttime, ...) so oldest_timestamp never goes earlier than the initial requested starttime
self.oldest_timestamp = max(starttime, self._get_oldest_image(filtered))

# check disk space
if not self.sent_diskspace_warning:
Expand Down Expand Up @@ -421,7 +426,7 @@ def query_server(self, browser, starttime, endtime):
return []

try:
matches = browser.get_files(directory, "jp2")
matches = browser.get_files(directory, "jp2", browser.server.filter)

files.extend(matches)
except NetworkError:
Expand Down
18 changes: 8 additions & 10 deletions install/helioviewer/hvpull/servers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,13 @@ def get_dates(self, starttime, endtime):

return dates

def get_file_regex(self):
    """Return the regex that describes the expected format of filenames
    on the server."""
    return self.filename_regex
def filter(self, file: str) -> bool:
    """
    Return True if *file* should be downloaded, otherwise False.

    Default implementation accepts every file; specific data-server
    subclasses may override this to download only selected files from
    the upstream directory.
    """
    # Base class imposes no restriction.
    return True

def get_measurements(self, nicknames, dates):
"""Get a list of all the URIs down to the measurement"""
Expand All @@ -85,7 +88,7 @@ def get_datetime_from_file(self, filename):
return get_datetime_from_file(filename)


class DataServerPauseDelayDefinesDefaultStartTime:
class DataServerPauseDelayDefinesDefaultStartTime(DataServer):
"""Class for interacting with data servers. In this class the
pause defines the default start time. If real time is UTC, then
the default start time is UTC - pause minutes."""
Expand Down Expand Up @@ -126,11 +129,6 @@ def get_dates(self, starttime, endtime):

return dates

def get_file_regex(self):
    """Return the regex that describes the expected format of filenames
    on the server."""
    return self.filename_regex

def get_measurements(self, nicknames, dates):
    """Get a list of all the URIs down to the measurement.

    NOTE(review): this implementation always returns None — presumably
    this server type has no measurement-level directory layer; confirm
    that all callers handle a None result.
    """
    return None
Expand Down
Loading