core.download.cloud_5d7

View Source

import hashlib, urllib, json, os
from zipfile import ZipFile
import requests

class CorpusDownload():
	'''
		Corpus/ data set downloader for 5d7 Storage.

		The remote index (INDEX_FILE) is fetched once on construction. As used
		by this class, the index provides a base link under 'download' and,
		under ['structure']['data'], a mapping of data set name to
		{file name: sha512 hex digest}.

		If the index cannot be loaded, `no_file` stays True and `request`
		only prints a hint to download the data manually.
	'''

	INDEX_FILE = 'https://cloud.example.com/s/sometoken/download?path=%2Findex.json'
	HEADERS = {
		'User-Agent': ''
	}

	def __init__(self, destination):
		'''
			Prepare the destination directory and fetch the remote index.

			Args:
				destination (string): The base directory where the data will be stored
		'''
		if not os.path.isdir(destination):
			print("Destination is not a directory! Will create:", destination)
			# makedirs (unlike mkdir) also creates missing parent directories
			os.makedirs(destination, exist_ok=True)

		self.destination = destination
		# flipped to False by _get_json_file once the index was loaded
		self.no_file = True
		self.index = self._get_json_file(CorpusDownload.INDEX_FILE)

	def _get_json_file(self, url):
		'''
			Download and parse a JSON document.

			Args:
				url (string): Address of the JSON file
			Returns:
				The parsed JSON, or None if the file could not be fetched or
				parsed (an error hint is printed in that case).
		'''
		try:
			r = requests.get(url, headers=CorpusDownload.HEADERS)
			if r.status_code == 200:
				index = json.loads(r.content)
				# only mark the index as available after it parsed successfully
				self.no_file = False
				return index
		except (requests.RequestException, ValueError):
			# network failure or invalid JSON: fall through to the error hint below
			pass

		print("Error connecting to 5d7 download, please download corpora manually to ./data/!")
		return None

	def request(self, name):
		'''
			Request a data set on disk. Will be downloaded and sha checked.
				(If already on disk only sha-check is done)
			Args:
				name (string): Name of the data set
		'''
		if self.no_file:
			print("Error connecting to 5d7 download, please download corpus '"+ name +"' manually to ./data/!")
			return

		# imported here so the method does not rely on `urllib.parse` having
		# been loaded as a side effect of another import
		from urllib.parse import quote_plus

		target_dir = os.path.join(self.destination, name)

		for key,files in self.index['structure']['data'].items():
			if key.lower() != name:
				continue

			if not os.path.isdir(target_dir):
				print("No directory for", name, "-- create it")
				os.makedirs(target_dir, exist_ok=True)

			for f,h in files.items():
				# the remote path always uses '/', independent of the local OS separator
				remote_path = '/'.join(('data', key, f))
				self._assure_file(
					self.index['download'] + quote_plus(remote_path),
					os.path.join(target_dir, f),
					h,
					target_dir
				)

	def _assure_file(self, link, local, sha, path):
		'''
			Make sure a verified copy of a file is on disk; unpack zip archives.

			Args:
				link (string): Download address of the file
				local (string): Local path of the file
				sha (string): Expected sha512 hex digest
				path (string): Directory zip archives are extracted into
		'''
		if not os.path.isfile(local):
			self._get_huge_file(link, local)

		if not self._validate_sha(sha, local):
			# one retry, the local copy may be outdated or truncated
			self._get_huge_file(link, local)

			if not self._validate_sha(sha, local):
				print("Unable to get:", link)
				# never unpack an archive that failed verification
				return

		if local.endswith(".zip"):
			with ZipFile(local, 'r') as zf:
				# members may be nested ("dir/file"), so test the full target
				# path instead of only the top-level directory listing
				need_unzip = any(
					not os.path.exists(os.path.join(path, member))
					for member in zf.namelist()
				)

				if need_unzip:
					print("Unzipping data")
					zf.extractall(path)

	def _get_huge_file(self, link, local):
		'''
			Stream a (possibly large) file to disk in 32 KiB chunks.

			Nothing is written if the server does not answer with status 200.

			Args:
				link (string): Download address of the file
				local (string): Local path the file is written to
		'''
		print("Starting download to:", local)
		# the context manager releases the streamed connection in every case
		with requests.get(link, stream=True, headers=CorpusDownload.HEADERS) as r:
			if r.status_code != 200:
				# do not store the body of an error response as if it were the data
				print("Download failed with HTTP status", r.status_code, "for:", link)
				return

			with open(local, 'wb') as f:
				for b in r.iter_content(32 * 1024):
					f.write(b)

	def _validate_sha(self, sha, local):
		'''
			Compare the sha512 digest of a local file with an expected value.

			Args:
				sha (string): Expected sha512 hex digest
				local (string): Local path of the file
			Returns:
				True if the digests match; False otherwise, including when
				the file does not exist.
		'''
		if not os.path.isfile(local):
			# a failed download leaves no file behind
			return False

		local_sha = hashlib.sha512()

		with open(local, 'rb') as f:
			# read in 64 KiB chunks to keep memory usage flat for huge files
			for chunk in iter(lambda: f.read(65536), b''):
				local_sha.update(chunk)

		return local_sha.hexdigest() == sha

# class CorpusDownload:

View Source

class CorpusDownload():
	'''
		Corpus/ data set downloader for 5d7 Storage.

		The remote index (INDEX_FILE) is fetched once on construction. As used
		by this class, the index provides a base link under 'download' and,
		under ['structure']['data'], a mapping of data set name to
		{file name: sha512 hex digest}.

		If the index cannot be loaded, `no_file` stays True and `request`
		only prints a hint to download the data manually.
	'''

	INDEX_FILE = 'https://cloud.example.com/s/sometoken/download?path=%2Findex.json'
	HEADERS = {
		'User-Agent': ''
	}

	def __init__(self, destination):
		'''
			Prepare the destination directory and fetch the remote index.

			Args:
				destination (string): The base directory where the data will be stored
		'''
		if not os.path.isdir(destination):
			print("Destination is not a directory! Will create:", destination)
			# makedirs (unlike mkdir) also creates missing parent directories
			os.makedirs(destination, exist_ok=True)

		self.destination = destination
		# flipped to False by _get_json_file once the index was loaded
		self.no_file = True
		self.index = self._get_json_file(CorpusDownload.INDEX_FILE)

	def _get_json_file(self, url):
		'''
			Download and parse a JSON document.

			Args:
				url (string): Address of the JSON file
			Returns:
				The parsed JSON, or None if the file could not be fetched or
				parsed (an error hint is printed in that case).
		'''
		try:
			r = requests.get(url, headers=CorpusDownload.HEADERS)
			if r.status_code == 200:
				index = json.loads(r.content)
				# only mark the index as available after it parsed successfully
				self.no_file = False
				return index
		except (requests.RequestException, ValueError):
			# network failure or invalid JSON: fall through to the error hint below
			pass

		print("Error connecting to 5d7 download, please download corpora manually to ./data/!")
		return None

	def request(self, name):
		'''
			Request a data set on disk. Will be downloaded and sha checked.
				(If already on disk only sha-check is done)
			Args:
				name (string): Name of the data set
		'''
		if self.no_file:
			print("Error connecting to 5d7 download, please download corpus '"+ name +"' manually to ./data/!")
			return

		# imported here so the method does not rely on `urllib.parse` having
		# been loaded as a side effect of another import
		from urllib.parse import quote_plus

		target_dir = os.path.join(self.destination, name)

		for key,files in self.index['structure']['data'].items():
			if key.lower() != name:
				continue

			if not os.path.isdir(target_dir):
				print("No directory for", name, "-- create it")
				os.makedirs(target_dir, exist_ok=True)

			for f,h in files.items():
				# the remote path always uses '/', independent of the local OS separator
				remote_path = '/'.join(('data', key, f))
				self._assure_file(
					self.index['download'] + quote_plus(remote_path),
					os.path.join(target_dir, f),
					h,
					target_dir
				)

	def _assure_file(self, link, local, sha, path):
		'''
			Make sure a verified copy of a file is on disk; unpack zip archives.

			Args:
				link (string): Download address of the file
				local (string): Local path of the file
				sha (string): Expected sha512 hex digest
				path (string): Directory zip archives are extracted into
		'''
		if not os.path.isfile(local):
			self._get_huge_file(link, local)

		if not self._validate_sha(sha, local):
			# one retry, the local copy may be outdated or truncated
			self._get_huge_file(link, local)

			if not self._validate_sha(sha, local):
				print("Unable to get:", link)
				# never unpack an archive that failed verification
				return

		if local.endswith(".zip"):
			with ZipFile(local, 'r') as zf:
				# members may be nested ("dir/file"), so test the full target
				# path instead of only the top-level directory listing
				need_unzip = any(
					not os.path.exists(os.path.join(path, member))
					for member in zf.namelist()
				)

				if need_unzip:
					print("Unzipping data")
					zf.extractall(path)

	def _get_huge_file(self, link, local):
		'''
			Stream a (possibly large) file to disk in 32 KiB chunks.

			Nothing is written if the server does not answer with status 200.

			Args:
				link (string): Download address of the file
				local (string): Local path the file is written to
		'''
		print("Starting download to:", local)
		# the context manager releases the streamed connection in every case
		with requests.get(link, stream=True, headers=CorpusDownload.HEADERS) as r:
			if r.status_code != 200:
				# do not store the body of an error response as if it were the data
				print("Download failed with HTTP status", r.status_code, "for:", link)
				return

			with open(local, 'wb') as f:
				for b in r.iter_content(32 * 1024):
					f.write(b)

	def _validate_sha(self, sha, local):
		'''
			Compare the sha512 digest of a local file with an expected value.

			Args:
				sha (string): Expected sha512 hex digest
				local (string): Local path of the file
			Returns:
				True if the digests match; False otherwise, including when
				the file does not exist.
		'''
		if not os.path.isfile(local):
			# a failed download leaves no file behind
			return False

		local_sha = hashlib.sha512()

		with open(local, 'rb') as f:
			# read in 64 KiB chunks to keep memory usage flat for huge files
			for chunk in iter(lambda: f.read(65536), b''):
				local_sha.update(chunk)

		return local_sha.hexdigest() == sha

Corpus / data set downloader for 5d7 Storage.

# CorpusDownload(destination)

View Source

	def __init__(self, destination):
		'''
			Prepare the destination directory and fetch the remote index.

			Args:
				destination (string): The base directory where the data will be stored
		'''
		if not os.path.isdir(destination):
			print("Destination is not a directory! Will create:", destination)
			# makedirs (unlike mkdir) also creates missing parent directories
			os.makedirs(destination, exist_ok=True)

		self.destination = destination
		# set to False by _get_json_file when the index is fetched with status 200
		self.no_file = True
		self.index = self._get_json_file(CorpusDownload.INDEX_FILE)

Args: `destination` (string) – the base directory where the data will be stored.

# INDEX_FILE = 'https://cloud.example.com/s/sometoken/download?path=%2Findex.json'

# HEADERS = {'User-Agent': ''}

# def request(self, name):

View Source

	def request(self, name):
		'''
			Request a data set on disk. Will be downloaded and sha checked.
				(If already on disk only sha-check is done)
			Args:
				name (string): Name of the data set
		'''
		if self.no_file:
			print("Error connecting to 5d7 download, please download corpus '"+ name +"' manually to ./data/!")
			return 

		for key,files in self.index['structure']['data'].items():
			if key.lower() == name:
				if not os.path.isdir(os.path.join(self.destination, name)):
					print("No directory for", name, "-- create it")
					os.mkdir(os.path.join(self.destination, name))

				for f,h in files.items():
					self._assure_file(
						self.index['download'] + urllib.parse.quote_plus(os.path.join('data', key, f)),
						os.path.join(self.destination, name, f),
						h,
						os.path.join(self.destination, name)
					)

Request a data set on disk. The data set is downloaded and its SHA-512 checksum is verified. (If it is already on disk, only the checksum check is done.)

Args

name (string): Name of the data set