ETL using Python (Example 1)

We will do the following steps using Python

  • Read several CSV, JSON and XML file types.

  • Extract data from the above file types and combine it.

  • Transform data.

  • Save the transformed data in a ready-to-load format which data engineers can use to load into an RDBMS

Please Download the "SOURCE.zip" Zip file and extract it. Replace “C:\Nazmul\Blog\Python\ETL\source\” to the location where you have extracted the Zip File. The following Python script is attacehed "ETL script using Python (Example 1).py"

##Import the required modules and functions

import glob # this module helps in selecting files

import pandas as pd # this module helps in processing CSV files

import xml.etree.ElementTree as ET # this module helps in processing XML files.

from datetime import datetime

### Set Path

tmpfile = "temp.tmp" # file used to store all extracted data

logfile = "logfile.txt" # all event logs will be stored in this file

targetfile = "transformed_data.csv" # file where transformed data is stored

### CSV Extract Function

def extract_from_csv(file_to_process):

dataframe = pd.read_csv(file_to_process)

return dataframe

### JSON Extract Function

def extract_from_json(file_to_process):

dataframe = pd.read_json(file_to_process,lines=True)

return dataframe

### XML Extract Function

def extract_from_xml(file_to_process):

dataframe = pd.DataFrame(columns=["name", "height", "weight"])

tree = ET.parse(file_to_process)

root = tree.getroot()

for person in root:

name = person.find("name").text

height = float(person.find("height").text)

weight = float(person.find("weight").text)

dataframe = dataframe.append({"name":name, "height":height, "weight":weight},

ignore_index=True)

return dataframe

### Extract Function

def extract():

extracted_data = pd.DataFrame(columns=['name','height','weight']) # create an empty data

frame to hold extracted data

#process all csv files

for csvfile in glob.glob(r"C:\Nazmul\Blog\Python\ETL\source\*.csv"):

extracted_data = extracted_data.append(extract_from_csv(csvfile), ignore_index=True)

#process all json files

for jsonfile in glob.glob(r"C:\Nazmul\Blog\Python\ETL\source\*.json"):

extracted_data = extracted_data.append(extract_from_json(jsonfile), ignore_index=True)

#process all xml files

for xmlfile in glob.glob(r"C:\Nazmul\Blog\Python\ETL\source\*.xml"):

extracted_data = extracted_data.append(extract_from_xml(xmlfile), ignore_index=True)

return extracted_data

### Transform

#The transform function does the following tasks.

#1. Convert height which is in inches to millimeter

#2. Convert weight which is in pounds to kilograms

def transform(data):

#Convert height which is in inches to millimeter

#Convert the datatype of the column into float

#data.height = data.height.astype(float)

#Convert inches to meters and round off to two decimals(one inch is 0.0254 meters)

data['height'] = round(data.height * 0.0254,2)

#Convert weight which is in pounds to kilograms

#Convert the datatype of the column into float

#data.weight = data.weight.astype(float)

#Convert pounds to kilograms and round off to two decimals(one pound is 0.45359237

kilograms)

data['weight'] = round(data.weight * 0.45359237,2)

return data

### Loading

def load(targetfile,data_to_load):

data_to_load.to_csv(targetfile)

### Logging

def log(message):

timestamp_format = '%Y-%h-%d-%H:%M:%S' # Year-Monthname-Day-Hour-Minute-Second

now = datetime.now() # get current timestamp

timestamp = now.strftime(timestamp_format)

with open("logfile.txt","a") as f:

f.write(timestamp + ',' + message + '\n')

###Running ETL Process

log("ETL Job Started")

log("Extract phase Started")

extracted_data = extract()

log("Extract phase Ended")

extracted_data

log("Transform phase Started")

transformed_data = transform(extracted_data)

log("Transform phase Ended")

transformed_data

log("Load phase Started")

load(targetfile,transformed_data)

log("Load phase Ended")

log("ETL Job Ended")