Azure Data Lake Storage Gen2 is a popular data storage system from Microsoft. I needed to download a complete folder/directory recursively from ADLS to local disk in an automated way.

I ended up writing a small utility to do this. It uses the Azure Blob API to download the files from Azure recursively.
The program below recursively downloads a directory from ADLS to the local disk. Before running it, modify the connection string, container name, source directory and target directory parameters in the program.
This program needs the following Python package. Install it with the following command.
pip install azure-storage-blob
Python Program
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8
import os
import posixpath

from azure.storage.blob import BlobServiceClient
class DownloadADLS:
    '''
    Download single blobs or whole directory trees from a storage container.

    Blob names always use '/' as their separator, so they are handled with
    plain string operations and ``posixpath``; ``os.path`` is reserved for
    paths on the local filesystem.
    '''

    def __init__(self, connection_string, container_name):
        '''
        Build the container client used by every other method.

        Args:
            connection_string: connection string handed to
                ``BlobServiceClient.from_connection_string``.
            container_name: name of the container to read from.
        '''
        service_client = BlobServiceClient.from_connection_string(connection_string)
        self.client = service_client.get_container_client(container_name)

    def download(self, source, dest):
        '''
        Download a file or directory to a path on the local filesystem

        If any blob is listed under ``source + '/'`` the source is treated
        as a directory and is recreated as ``dest/<last component of source>``.
        Otherwise ``source`` is downloaded as a single blob.

        Args:
            source: blob name or directory prefix inside the container.
            dest: local destination; must be non-empty.

        Raises:
            ValueError: if ``dest`` is empty.
        '''
        if not dest:
            raise ValueError('A destination must be provided')

        relative_names = self.ls_files(source, recursive=True)
        if not relative_names:
            # Nothing is listed below "source/", so source names one blob.
            self.download_file(source, dest)
            return

        prefix = self._as_prefix(source)
        # Append the directory name from source to the destination; an
        # empty source (container root) has no name to append.
        top_dir = posixpath.basename(prefix.rstrip('/'))
        target_root = os.path.join(dest, top_dir) if top_dir else dest
        for relative_name in relative_names:
            # Empty components are dropped so a name containing '//' cannot
            # reset os.path.join to an absolute path.
            parts = [part for part in relative_name.split('/') if part]
            self._write_blob(prefix + relative_name,
                             os.path.join(target_root, *parts))

    def download_file(self, source, dest):
        '''
        Download a single file to a path on the local filesystem

        Args:
            source: full blob name inside the container.
            dest: local path. It is treated as a directory when it ends
                with '/' or '.', in which case the blob's base name is
                appended; otherwise it is used as the file path itself.
        '''
        if dest.endswith('.'):
            dest += '/'
        if dest.endswith('/'):
            local_path = dest + posixpath.basename(source)
        else:
            local_path = dest
        self._write_blob(source, local_path)

    def ls_files(self, path, recursive=False):
        '''
        List files under a path, optionally recursively

        Args:
            path: directory prefix; '' lists from the container root.
            recursive: when False only names directly under ``path``
                (no further '/') are returned.

        Returns:
            List of blob names relative to ``path``, in listing order.
        '''
        return [name for name in self._relative_names(path)
                if recursive or '/' not in name]

    def ls_dirs(self, path, recursive=False):
        '''
        List directories under a path, optionally recursively

        Directories are derived from the listed blob names, so a directory
        that only contains sub-directories is reported as well.

        Args:
            path: directory prefix; '' lists from the container root.
            recursive: when False only the first-level directories are
                returned; when True every ancestor directory is returned.

        Returns:
            List of unique directory names relative to ``path``, in
            first-seen order.
        '''
        seen = set()
        dirs = []
        for name in self._relative_names(path):
            components = [part for part in name.split('/')[:-1] if part]
            if not recursive:
                components = components[:1]
            for depth in range(1, len(components) + 1):
                directory = '/'.join(components[:depth])
                if directory not in seen:
                    seen.add(directory)
                    dirs.append(directory)
        return dirs

    @staticmethod
    def _as_prefix(path):
        '''Return path with a trailing '/' unless it is empty.'''
        if path and not path.endswith('/'):
            return path + '/'
        return path

    def _relative_names(self, path):
        '''Yield the name of every blob listed under path, relative to it.'''
        prefix = self._as_prefix(path)
        # NOTE(review): on accounts with a hierarchical namespace the listing
        # may also contain directory entries - confirm and filter if needed.
        for blob in self.client.list_blobs(name_starts_with=prefix):
            # The listing is filtered by prefix, so slicing it off yields the
            # relative name without normalising the (opaque) blob name.
            relative_name = blob.name[len(prefix):]
            if relative_name:
                yield relative_name

    def _write_blob(self, blob_name, local_path):
        '''Stream one blob into local_path, creating parent directories.'''
        print(f'Downloading {blob_name} to {local_path}')
        parent = os.path.dirname(local_path)
        if parent:
            # os.makedirs('') raises, so skip it for bare file names.
            os.makedirs(parent, exist_ok=True)
        # Start the download before opening the file so a failed request
        # does not leave an empty file behind.
        downloader = self.client.get_blob_client(blob=blob_name).download_blob()
        with open(local_path, 'wb') as file:
            # readinto writes the content to the file object instead of
            # building one bytes object for the whole blob.
            downloader.readinto(file)
if __name__ == "__main__":
    # Placeholders: fill these in before running the script.
    CONNECTION_STRING = ""  # storage-account connection string
    CONTAINER_NAME = ""     # container that holds the source directory

    downloader = DownloadADLS(
        connection_string=CONNECTION_STRING,
        container_name=CONTAINER_NAME,
    )
    # source: directory (or blob) inside the container; dest: local directory.
    downloader.download(source="", dest="")
Advertisement