core.download.cloud_5d7
View Source
import hashlib
import json
import os
import urllib.parse
from zipfile import ZipFile

import requests


class CorpusDownload():
    '''
    Corpus/ data set downloader for 5d7 Storage.

    The remote index file is fetched on construction. ``request`` then
    downloads (or re-validates) every file of one data set below the
    destination directory and unpacks zip archives.
    '''

    INDEX_FILE = 'https://cloud.example.com/s/sometoken/download?path=%2Findex.json'
    # NOTE(review): an empty User-Agent is sent on every request -- presumably
    # the server rejects the default one; confirm before changing.
    HEADERS = {
        'User-Agent': ''
    }

    def __init__(self, destination):
        '''
        Args:
            destination (string): The base directory where the data will be stored
        '''
        if not os.path.isdir(destination):
            print("Destination is not a directory! Will create:", destination)
            # makedirs also creates missing parent directories.
            os.makedirs(destination, exist_ok=True)

        self.destination = destination
        # Stays True until the index was downloaded and parsed successfully.
        self.no_file = True
        self.index = self._get_json_file(CorpusDownload.INDEX_FILE)

    def _get_json_file(self, url):
        '''
        Fetch and parse the JSON document at ``url``.

        Args:
            url (string): Address of the JSON file

        Returns:
            The parsed document, or ``None`` if the request failed, the
            status code was not 200 or the body was not valid JSON. Only on
            success ``self.no_file`` is set to ``False``.
        '''
        failure = "Error connecting to 5d7 download, please download corpora manually to ./data/!"

        try:
            r = requests.get(url, headers=CorpusDownload.HEADERS)
        except requests.RequestException:
            # No network / DNS failure / refused connection: degrade to the
            # same "download manually" path as a bad status code.
            print(failure)
            return None

        if r.status_code != 200:
            print(failure)
            return None

        try:
            index = json.loads(r.content)
        except ValueError:
            # A 200 response that is not JSON (e.g. an HTML error page).
            print(failure)
            return None

        self.no_file = False
        return index

    def request(self, name):
        '''
        Request a data set on disk. Will be downloaded and sha checked.
        (If already on disk only sha-check is done)

        Args:
            name (string): Name of the data set. It is compared against the
                lower-cased keys of the index.
        '''
        if self.no_file:
            print("Error connecting to 5d7 download, please download corpus '"+ name +"' manually to ./data/!")
            return

        target_dir = os.path.join(self.destination, name)
        for key, files in self.index['structure']['data'].items():
            if key.lower() != name:
                continue

            if not os.path.isdir(target_dir):
                print("No directory for", name, "-- create it")
                os.makedirs(target_dir, exist_ok=True)

            for f, h in files.items():
                # URL paths always use '/', independent of the local OS.
                remote_path = '/'.join(('data', key, f))
                self._assure_file(
                    self.index['download'] + urllib.parse.quote_plus(remote_path),
                    os.path.join(target_dir, f),
                    h,
                    target_dir
                )

    def _assure_file(self, link, local, sha, path):
        '''
        Make sure ``local`` exists with the expected checksum and is unpacked.

        Args:
            link (string): Download address of the file
            local (string): Path of the file on disk
            sha (string): Expected sha512 hex digest
            path (string): Directory zip archives are extracted to
        '''
        if not os.path.isfile(local):
            self._get_huge_file(link, local)

        if not self._validate_sha(sha, local):
            # One retry, then give up.
            self._get_huge_file(link, local)
            if not self._validate_sha(sha, local):
                print("Unable to get:", link)
                # Never unpack an archive that failed verification.
                return

        if local.endswith(".zip"):
            with ZipFile(local, 'r') as zf:
                # Member names may contain sub directories, so test the full
                # path instead of looking them up in the top-level listing.
                need_unzip = any(
                    not os.path.exists(os.path.join(path, member))
                    for member in zf.namelist()
                )
                if need_unzip:
                    print("Unzipping data")
                    zf.extractall(path)

    def _get_huge_file(self, link, local):
        '''
        Stream the content of ``link`` into the file ``local``.

        Args:
            link (string): Download address of the file
            local (string): Path the content is written to
        '''
        print("Starting download to:", local)
        # The context manager releases the connection once streaming is done.
        with requests.get(link, stream=True, headers=CorpusDownload.HEADERS) as r:
            with open(local, 'wb') as f:
                for b in r.iter_content(32 * 1024):
                    f.write(b)

    def _validate_sha(self, sha, local):
        '''
        Compare the sha512 hex digest of the file ``local`` with ``sha``.

        Args:
            sha (string): Expected sha512 hex digest
            local (string): Path of the file on disk

        Returns:
            bool: ``True`` if the digests are equal
        '''
        local_sha = hashlib.sha512()
        with open(local, 'rb') as f:
            # Read in chunks so large corpora are not loaded into memory.
            for b in iter(lambda: f.read(65536), b''):
                local_sha.update(b)
        return local_sha.hexdigest() == sha
View Source
class CorpusDownload():
    '''
    Corpus/ data set downloader for 5d7 Storage.

    The remote index file is fetched on construction. ``request`` then
    downloads (or re-validates) every file of one data set below the
    destination directory and unpacks zip archives.
    '''

    INDEX_FILE = 'https://cloud.example.com/s/sometoken/download?path=%2Findex.json'
    # NOTE(review): an empty User-Agent is sent on every request -- presumably
    # the server rejects the default one; confirm before changing.
    HEADERS = {
        'User-Agent': ''
    }

    def __init__(self, destination):
        '''
        Args:
            destination (string): The base directory where the data will be stored
        '''
        if not os.path.isdir(destination):
            print("Destination is not a directory! Will create:", destination)
            # makedirs also creates missing parent directories.
            os.makedirs(destination, exist_ok=True)

        self.destination = destination
        # Stays True until the index was downloaded and parsed successfully.
        self.no_file = True
        self.index = self._get_json_file(CorpusDownload.INDEX_FILE)

    def _get_json_file(self, url):
        '''
        Fetch and parse the JSON document at ``url``.

        Args:
            url (string): Address of the JSON file

        Returns:
            The parsed document, or ``None`` if the request failed, the
            status code was not 200 or the body was not valid JSON. Only on
            success ``self.no_file`` is set to ``False``.
        '''
        failure = "Error connecting to 5d7 download, please download corpora manually to ./data/!"

        try:
            r = requests.get(url, headers=CorpusDownload.HEADERS)
        except requests.RequestException:
            # No network / DNS failure / refused connection: degrade to the
            # same "download manually" path as a bad status code.
            print(failure)
            return None

        if r.status_code != 200:
            print(failure)
            return None

        try:
            index = json.loads(r.content)
        except ValueError:
            # A 200 response that is not JSON (e.g. an HTML error page).
            print(failure)
            return None

        self.no_file = False
        return index

    def request(self, name):
        '''
        Request a data set on disk. Will be downloaded and sha checked.
        (If already on disk only sha-check is done)

        Args:
            name (string): Name of the data set. It is compared against the
                lower-cased keys of the index.
        '''
        # Imported explicitly: a plain ``import urllib`` does not guarantee
        # that the ``parse`` submodule is loaded.
        from urllib.parse import quote_plus

        if self.no_file:
            print("Error connecting to 5d7 download, please download corpus '"+ name +"' manually to ./data/!")
            return

        target_dir = os.path.join(self.destination, name)
        for key, files in self.index['structure']['data'].items():
            if key.lower() != name:
                continue

            if not os.path.isdir(target_dir):
                print("No directory for", name, "-- create it")
                os.makedirs(target_dir, exist_ok=True)

            for f, h in files.items():
                # URL paths always use '/', independent of the local OS.
                remote_path = '/'.join(('data', key, f))
                self._assure_file(
                    self.index['download'] + quote_plus(remote_path),
                    os.path.join(target_dir, f),
                    h,
                    target_dir
                )

    def _assure_file(self, link, local, sha, path):
        '''
        Make sure ``local`` exists with the expected checksum and is unpacked.

        Args:
            link (string): Download address of the file
            local (string): Path of the file on disk
            sha (string): Expected sha512 hex digest
            path (string): Directory zip archives are extracted to
        '''
        if not os.path.isfile(local):
            self._get_huge_file(link, local)

        if not self._validate_sha(sha, local):
            # One retry, then give up.
            self._get_huge_file(link, local)
            if not self._validate_sha(sha, local):
                print("Unable to get:", link)
                # Never unpack an archive that failed verification.
                return

        if local.endswith(".zip"):
            with ZipFile(local, 'r') as zf:
                # Member names may contain sub directories, so test the full
                # path instead of looking them up in the top-level listing.
                need_unzip = any(
                    not os.path.exists(os.path.join(path, member))
                    for member in zf.namelist()
                )
                if need_unzip:
                    print("Unzipping data")
                    zf.extractall(path)

    def _get_huge_file(self, link, local):
        '''
        Stream the content of ``link`` into the file ``local``.

        Args:
            link (string): Download address of the file
            local (string): Path the content is written to
        '''
        print("Starting download to:", local)
        # The context manager releases the connection once streaming is done.
        with requests.get(link, stream=True, headers=CorpusDownload.HEADERS) as r:
            with open(local, 'wb') as f:
                for b in r.iter_content(32 * 1024):
                    f.write(b)

    def _validate_sha(self, sha, local):
        '''
        Compare the sha512 hex digest of the file ``local`` with ``sha``.

        Args:
            sha (string): Expected sha512 hex digest
            local (string): Path of the file on disk

        Returns:
            bool: ``True`` if the digests are equal
        '''
        local_sha = hashlib.sha512()
        with open(local, 'rb') as f:
            # Read in chunks so large corpora are not loaded into memory.
            for b in iter(lambda: f.read(65536), b''):
                local_sha.update(b)
        return local_sha.hexdigest() == sha
Corpus/data set downloader for 5d7 Storage.
View Source
def __init__(self, destination):
    '''
    Args:
        destination (string): The base directory where the data will be stored
    '''
    if not os.path.isdir(destination):
        print("Destination is not a directory! Will create:", destination)
        # makedirs also creates missing parent directories, so a nested
        # destination no longer fails.
        os.makedirs(destination, exist_ok=True)

    self.destination = destination
    # Stays True unless _get_json_file reports a successful download.
    self.no_file = True
    self.index = self._get_json_file(CorpusDownload.INDEX_FILE)
Args: destination (string): The base directory where the data will be stored
View Source
def request(self, name):
    '''
    Request a data set on disk. Will be downloaded and sha checked.
    (If already on disk only sha-check is done)

    Args:
        name (string): Name of the data set. It is compared against the
            lower-cased keys of the index.
    '''
    # Imported explicitly: a plain ``import urllib`` does not guarantee
    # that the ``parse`` submodule is loaded.
    from urllib.parse import quote_plus

    if self.no_file:
        print("Error connecting to 5d7 download, please download corpus '"+ name +"' manually to ./data/!")
        return

    target_dir = os.path.join(self.destination, name)
    for key, files in self.index['structure']['data'].items():
        if key.lower() != name:
            continue

        if not os.path.isdir(target_dir):
            print("No directory for", name, "-- create it")
            # Also creates the base directory if it vanished meanwhile.
            os.makedirs(target_dir, exist_ok=True)

        for f, h in files.items():
            # URL paths always use '/', independent of the local OS.
            remote_path = '/'.join(('data', key, f))
            self._assure_file(
                self.index['download'] + quote_plus(remote_path),
                os.path.join(target_dir, f),
                h,
                target_dir
            )
Request a data set on disk. It will be downloaded and SHA-checked. (If it is already on disk, only the SHA check is done.)
Args
- name (string): Name of the data set