ETL using Python (Example 1)

We will do the following steps using Python

Read several CSV, JSON and XML file types.
Extract data from the above file types and combine it.
Transform data.
Save the transformed data in a ready-to-load format which data engineers can use to load into an RDBMS

Please Download the "SOURCE.zip" Zip file and extract it. Replace “C:\Nazmul\Blog\Python\ETL\source\” to the location where you have extracted the Zip File. The following Python script is attacehed "ETL script using Python (Example 1).py"

##Import the required modules and functions

import glob # this module helps in selecting files

import pandas as pd # this module helps in processing CSV files

import xml.etree.ElementTree as ET # this module helps in processing XML files.

from datetime import datetime

### Set Path

tmpfile = "temp.tmp" # file used to store all extracted data

logfile = "logfile.txt" # all event logs will be stored in this file

targetfile = "transformed_data.csv" # file where transformed data is stored

### CSV Extract Function

def extract_from_csv(file_to_process):

dataframe = pd.read_csv(file_to_process)

return dataframe

### JSON Extract Function

def extract_from_json(file_to_process):

dataframe = pd.read_json(file_to_process,lines=True)

return dataframe

### XML Extract Function

def extract_from_xml(file_to_process):

dataframe = pd.DataFrame(columns=["name", "height", "weight"])

tree = ET.parse(file_to_process)

root = tree.getroot()

for person in root:

name = person.find("name").text

height = float(person.find("height").text)

weight = float(person.find("weight").text)

dataframe = dataframe.append({"name":name, "height":height, "weight":weight},

ignore_index=True)

return dataframe

### Extract Function

def extract():

extracted_data = pd.DataFrame(columns=['name','height','weight']) # create an empty data

frame to hold extracted data

#process all csv files

for csvfile in glob.glob(r"C:\Nazmul\Blog\Python\ETL\source\*.csv"):

extracted_data = extracted_data.append(extract_from_csv(csvfile), ignore_index=True)

#process all json files

for jsonfile in glob.glob(r"C:\Nazmul\Blog\Python\ETL\source\*.json"):

extracted_data = extracted_data.append(extract_from_json(jsonfile), ignore_index=True)

#process all xml files

for xmlfile in glob.glob(r"C:\Nazmul\Blog\Python\ETL\source\*.xml"):

extracted_data = extracted_data.append(extract_from_xml(xmlfile), ignore_index=True)

return extracted_data

### Transform

#The transform function does the following tasks.

#1. Convert height which is in inches to millimeter

#2. Convert weight which is in pounds to kilograms

def transform(data):

#Convert height which is in inches to millimeter

#Convert the datatype of the column into float

#data.height = data.height.astype(float)

#Convert inches to meters and round off to two decimals(one inch is 0.0254 meters)

data['height'] = round(data.height * 0.0254,2)

#Convert weight which is in pounds to kilograms

#Convert the datatype of the column into float

#data.weight = data.weight.astype(float)

#Convert pounds to kilograms and round off to two decimals(one pound is 0.45359237

kilograms)

data['weight'] = round(data.weight * 0.45359237,2)

return data

### Loading

def load(targetfile,data_to_load):

data_to_load.to_csv(targetfile)

### Logging

def log(message):

timestamp_format = '%Y-%h-%d-%H:%M:%S' # Year-Monthname-Day-Hour-Minute-Second

now = datetime.now() # get current timestamp

timestamp = now.strftime(timestamp_format)

with open("logfile.txt","a") as f:

f.write(timestamp + ',' + message + '\n')

###Running ETL Process

log("ETL Job Started")

log("Extract phase Started")

extracted_data = extract()

log("Extract phase Ended")

extracted_data

log("Transform phase Started")

transformed_data = transform(extracted_data)

log("Transform phase Ended")

transformed_data

log("Load phase Started")

load(targetfile,transformed_data)

log("Load phase Ended")

log("ETL Job Ended")