How to use Boto3 to download all files from an S3 Bucket?
Introduction
AWS Boto3 is the Python SDK for AWS. Boto3 can be used to directly interact with AWS resources from Python scripts.
Boto3’s S3 API doesn’t have any method to download all of the files from your S3 bucket at once.
In this tutorial, we will look at how we can use the Boto3 library to download all the files from your S3 bucket.
Table of contents
Assumptions
In this tutorial, we will replicate the functionality of aws s3 sync. We will create the same directory structure as the S3 bucket.
We will be using the following API methods from the Boto SDK:
list_objects_v2
: List all of the objects in our S3 bucket. Note: by default, this method will only return up to 1000 keys. We will be using the
ContinuationToken
to ensure we retrieve all the objects in your bucket.
download_file
: Download the specified object
Retrieve the list of files & directories in S3 bucket
We will use the list_objects_v2 method to retrieve all the objects from the bucket with the specified prefix.
We also check whether there are any remaining objects in the S3 bucket using the ContinuationToken.
def get_file_folders(s3_client, bucket_name, prefix=""):
    """Return (file_keys, folder_keys) for every object under *prefix*.

    Pages through ``list_objects_v2`` with ``ContinuationToken`` so buckets
    containing more than 1000 keys (the per-call cap) are fully enumerated.

    Args:
        s3_client: A boto3 S3 client (anything exposing ``list_objects_v2``).
        bucket_name: Name of the S3 bucket to list.
        prefix: Optional key prefix to restrict the listing.

    Returns:
        A tuple ``(file_names, folders)`` of object keys; keys ending in
        ``/`` (zero-byte folder placeholders) go into ``folders``.
    """
    file_names = []
    folders = []
    default_kwargs = {
        "Bucket": bucket_name,
        "Prefix": prefix,
    }
    next_token = ""
    while next_token is not None:
        updated_kwargs = default_kwargs.copy()
        if next_token != "":
            updated_kwargs["ContinuationToken"] = next_token
        response = s3_client.list_objects_v2(**updated_kwargs)
        # "Contents" is absent entirely when the bucket/prefix has no
        # objects — default to an empty list instead of iterating None.
        for result in response.get("Contents", []):
            key = result.get("Key")
            # Keys ending in "/" are folder placeholder objects.
            if key.endswith("/"):
                folders.append(key)
            else:
                file_names.append(key)
        # Absent token means this was the last page, which ends the loop.
        next_token = response.get("NextContinuationToken")
    return file_names, folders
Create folders & download files
Once we have the list of files and folders in our S3 bucket, we can first create the corresponding folders in our local path.
Next, we download one file at a time to our local path.
def download_files(s3_client, bucket_name, local_path, file_names, folders):
    """Recreate the bucket's directory layout under *local_path*, then
    download every listed object into its mirrored location.

    Args:
        s3_client: A boto3 S3 client (anything exposing ``download_file``).
        bucket_name: Name of the source S3 bucket.
        local_path: Destination directory on the local filesystem.
        file_names: Object keys to download.
        folders: Folder-placeholder keys (ending in ``/``) to create locally.
    """
    root = Path(local_path)

    # First materialise the explicit folder placeholders.
    for folder_key in folders:
        (root / folder_key).mkdir(parents=True, exist_ok=True)

    # Then fetch each object, creating parent directories on demand so
    # files whose folders had no placeholder still land correctly.
    for object_key in file_names:
        destination = root / object_key
        destination.parent.mkdir(parents=True, exist_ok=True)
        s3_client.download_file(bucket_name, object_key, str(destination))
Tying it together
This is what the overall code looks like, and how to download files from S3.
import boto3
from pathlib import Path
def get_file_folders(s3_client, bucket_name, prefix=""):
    """Return (file_keys, folder_keys) for every object under *prefix*.

    Pages through ``list_objects_v2`` with ``ContinuationToken`` so buckets
    containing more than 1000 keys (the per-call cap) are fully enumerated.

    Args:
        s3_client: A boto3 S3 client (anything exposing ``list_objects_v2``).
        bucket_name: Name of the S3 bucket to list.
        prefix: Optional key prefix to restrict the listing.

    Returns:
        A tuple ``(file_names, folders)`` of object keys; keys ending in
        ``/`` (zero-byte folder placeholders) go into ``folders``.
    """
    file_names = []
    folders = []
    default_kwargs = {
        "Bucket": bucket_name,
        "Prefix": prefix,
    }
    next_token = ""
    while next_token is not None:
        updated_kwargs = default_kwargs.copy()
        if next_token != "":
            updated_kwargs["ContinuationToken"] = next_token
        # BUG FIX: the original called list_objects_v2(**default_kwargs),
        # which silently dropped the ContinuationToken — pagination never
        # advanced and buckets with >1000 keys looped on the first page.
        response = s3_client.list_objects_v2(**updated_kwargs)
        # "Contents" is absent entirely when the bucket/prefix has no
        # objects — default to an empty list instead of iterating None.
        for result in response.get("Contents", []):
            key = result.get("Key")
            # Keys ending in "/" are folder placeholder objects.
            if key.endswith("/"):
                folders.append(key)
            else:
                file_names.append(key)
        # Absent token means this was the last page, which ends the loop.
        next_token = response.get("NextContinuationToken")
    return file_names, folders
def download_files(s3_client, bucket_name, local_path, file_names, folders):
    """Mirror the bucket layout under *local_path* and download each object.

    Folder-placeholder keys are created as directories first; every file is
    then fetched into the corresponding path, with missing parent
    directories created on the fly.
    """
    base_dir = Path(local_path)

    # Create the directories that exist as explicit "/"-suffixed keys.
    for placeholder in folders:
        target_dir = base_dir / placeholder
        target_dir.mkdir(parents=True, exist_ok=True)

    # Download files one at a time, ensuring each parent directory exists
    # even when the bucket had no placeholder object for it.
    for key in file_names:
        local_file = base_dir / key
        local_file.parent.mkdir(parents=True, exist_ok=True)
        s3_client.download_file(bucket_name, key, str(local_file))
def main():
    """Entry point: sync the example bucket into a local directory."""
    bucket = "bucket-for-my-blog"
    s3 = boto3.client("s3")
    # List everything first, then mirror folders and download the files.
    files, dirs = get_file_folders(s3, bucket)
    download_files(
        s3,
        bucket,
        "/home/abhishek/Projects/my-s3-folder",
        files,
        dirs,
    )


if __name__ == "__main__":
    main()