ETL using Python (Example 1)
We will do the following steps using Python:
Read several CSV, JSON, and XML source files.
Extract the data from these files and combine it.
Transform the data.
Save the transformed data in a ready-to-load format that data engineers can use to load into an RDBMS.
Please download the "SOURCE.zip" file and extract it. Replace "C:\Nazmul\Blog\Python\ETL\source\" with the location where you extracted the Zip file. The full Python script is attached as "ETL script using Python (Example 1).py".
### Import the required modules and functions
import glob # this module helps in selecting files
import pandas as pd # this module helps in processing CSV files
import xml.etree.ElementTree as ET # this module helps in processing XML files.
from datetime import datetime
### Set Path
tmpfile = "temp.tmp" # file used to store all extracted data
logfile = "logfile.txt" # all event logs will be stored in this file
targetfile = "transformed_data.csv" # file where transformed data is stored
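The extract step below hard-codes the source folder inside three glob patterns. As an optional convenience (not part of the original script), you can keep that folder in a single variable so there is only one place to change when you move the files, for example:

import os
source_path = r"C:\Nazmul\Blog\Python\ETL\source"  # folder where SOURCE.zip was extracted (adjust to your machine)
csv_pattern = os.path.join(source_path, "*.csv")   # pattern you could pass to glob.glob(...) instead of the hard-coded string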
### CSV Extract Function
def extract_from_csv(file_to_process):
    dataframe = pd.read_csv(file_to_process)
    return dataframe
### JSON Extract Function
def extract_from_json(file_to_process):
    dataframe = pd.read_json(file_to_process, lines=True)
    return dataframe
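The lines=True argument tells pandas to treat each source file as JSON Lines, i.e. one JSON object per line. A file of that shape would look roughly like this (hypothetical sample values, not the actual contents of SOURCE.zip):

{"name": "alex", "height": 65.78, "weight": 112.99}
{"name": "ajay", "height": 71.52, "weight": 136.49}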
### XML Extract Function
def extract_from_xml(file_to_process):
    dataframe = pd.DataFrame(columns=["name", "height", "weight"])
    tree = ET.parse(file_to_process)
    root = tree.getroot()
    for person in root:
        name = person.find("name").text
        height = float(person.find("height").text)
        weight = float(person.find("weight").text)
        # DataFrame.append was removed in pandas 2.0, so build a one-row frame and concatenate it
        row = pd.DataFrame([{"name": name, "height": height, "weight": weight}])
        dataframe = pd.concat([dataframe, row], ignore_index=True)
    return dataframe
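This parser assumes each XML source file has a root element whose children each carry name, height and weight child elements, roughly like the sketch below (the <data> and <person> tag names are illustrative assumptions; the code only relies on the name, height and weight children):

<data>
    <person>
        <name>alex</name>
        <height>65.78</height>
        <weight>112.99</weight>
    </person>
</data>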
### Extract Function
def extract():
    extracted_data = pd.DataFrame(columns=['name', 'height', 'weight'])  # empty dataframe to hold extracted data
    # process all csv files
    for csvfile in glob.glob(r"C:\Nazmul\Blog\Python\ETL\source\*.csv"):
        extracted_data = pd.concat([extracted_data, extract_from_csv(csvfile)], ignore_index=True)
    # process all json files
    for jsonfile in glob.glob(r"C:\Nazmul\Blog\Python\ETL\source\*.json"):
        extracted_data = pd.concat([extracted_data, extract_from_json(jsonfile)], ignore_index=True)
    # process all xml files
    for xmlfile in glob.glob(r"C:\Nazmul\Blog\Python\ETL\source\*.xml"):
        extracted_data = pd.concat([extracted_data, extract_from_xml(xmlfile)], ignore_index=True)
    return extracted_data
### Transform
#The transform function does the following tasks:
#1. Convert height, which is in inches, to meters
#2. Convert weight, which is in pounds, to kilograms
def transform(data):
    # Convert height, which is in inches, to meters
    # If the column arrives as strings, convert it to float first:
    # data.height = data.height.astype(float)
    # Convert inches to meters and round off to two decimals (one inch is 0.0254 meters)
    data['height'] = round(data.height * 0.0254, 2)
    # Convert weight, which is in pounds, to kilograms
    # If the column arrives as strings, convert it to float first:
    # data.weight = data.weight.astype(float)
    # Convert pounds to kilograms and round off to two decimals (one pound is 0.45359237 kilograms)
    data['weight'] = round(data.weight * 0.45359237, 2)
    return data
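As a quick sanity check on the arithmetic (made-up values, not rows from the source files): 65.78 inches becomes 65.78 * 0.0254 ≈ 1.67 meters, and 112.99 pounds becomes 112.99 * 0.45359237 ≈ 51.25 kilograms.

# sample = pd.DataFrame([{"name": "alex", "height": 65.78, "weight": 112.99}])
# transform(sample)   # -> height 1.67, weight 51.25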
### Loading
def load(targetfile,data_to_load):
    data_to_load.to_csv(targetfile)
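The CSV written here is the ready-to-load hand-off for the data engineers. If you also want to see the last mile, the following is a minimal sketch of loading that file into an RDBMS, assuming SQLite and an illustrative table name (it is not part of the original script; pandas is already imported above as pd):

import sqlite3

def load_to_rdbms(csv_file, db_file="etl_demo.db", table_name="people"):
    # read the transformed CSV back and write it into a SQLite table
    df = pd.read_csv(csv_file)
    with sqlite3.connect(db_file) as conn:
        df.to_sql(table_name, conn, if_exists="replace", index=False)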
### Logging
def log(message):
    timestamp_format = '%Y-%b-%d-%H:%M:%S'  # Year-MonthName-Day-Hour:Minute:Second (%b is the portable month-name code)
    now = datetime.now()  # get current timestamp
    timestamp = now.strftime(timestamp_format)
    with open(logfile, "a") as f:  # reuse the logfile path set above
        f.write(timestamp + ',' + message + '\n')
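Each call appends one comma-separated line to logfile.txt, so with the format above an entry looks roughly like this (timestamp shown for illustration only):

2024-Jan-01-10:15:30,ETL Job Started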
### Running ETL Process
log("ETL Job Started")
log("Extract phase Started")
extracted_data = extract()
log("Extract phase Ended")
print(extracted_data)  # preview the combined extracted data
log("Transform phase Started")
transformed_data = transform(extracted_data)
log("Transform phase Ended")
print(transformed_data)  # preview the transformed data
log("Load phase Started")
load(targetfile,transformed_data)
log("Load phase Ended")
log("ETL Job Ended")