There are multiple approaches to split a large file into multiple small files. I am explaining two approaches in this article.

split_file_using_linux_command_python

Approach 1: Using split command

Using split command in Linux.

The syntax is given below. The command will split the files into multiple small files each with 2000 lines. You can change the line count depending upon your requirement. This is a utility available in Linux OS. The syntax is given below.

split -l <number of lines per file> <file name>

Example command
split -l 2000 data.csv

Approach 2: Using python program

The python program to split a large csv file into smaller csv files is given below.


import os
import json
import pandas as pd
def data_extractor(file_path, delimiter, required_fields=[]):
"""
:param file_path:
:param delimiter:
:param required_fields:
:return:
"""
if len(required_fields) > 0:
df = pd.read_csv(file_path, sep=delimiter, usecols=required_fields)
else:
df = pd.read_csv(file_path, sep=delimiter)
data_list = df.to_dict('records')
print("Record Count —>", len(data_list))
return data_list
def divide_chunks(l, n):
"""
:param l: list
:param n: number of splits
:return: list of smaller lists
"""
# looping till length l
for i in range(0, len(l), n):
yield l[i:i + n]
def split_writer(list_of_lists, output_dir, file_prefix="data_"):
"""
Function Description
:param list_of_lists:
:param output_dir:
:param file_prefix:
:return:
"""
i = 0
for each_list in list_of_lists:
f = pd.DataFrame(each_list)
data_prefix = os.path.join(output_dir, file_prefix)
fw = open(data_prefix + str(i) + ".csv", "w", encoding='utf-8')
fw.write(json.dumps(f))
fw.close()
i += 1
print("Total number of file splits –>", i+1)
if __name__ == '__main__':
file_path = 'large_data.csv'
# specify the required fields to extract from the file.
# You can keep this empty if you want to consider all the fields
required_fields = []
# specify the delimiter
delimiter = "\t"
# Number of records per file
number_of_records_per_file = 2000
# Output directory
out_dir = "outdir"
d_list = data_extractor(file_path, delimiter, required_fields)
list_of_lists = list(divide_chunks(d_list, number_of_records_per_file))
split_writer(list_of_lists,out_dir)

Advertisement