We will perform the extract, transform, and load (ETL) steps below using Python.
Please download the "SOURCE.zip" file and extract it. Replace "C:\Nazmul\Blog\Python\ETL\source\" with the location where you extracted the ZIP file. The complete Python script is attached as "ETL script using Python (Example 1).py".

## Import the required modules and functions
import glob                            # this module helps in selecting files
import pandas as pd                    # this module helps in processing CSV files
import xml.etree.ElementTree as ET     # this module helps in processing XML files
from datetime import datetime
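Before running the script, it can help to confirm that Python can actually see the extracted files. The short check below is a minimal sketch; SOURCE_DIR is my own placeholder name, and the path shown assumes the same extraction location used throughout the script.

# Sanity-check sketch: list the extracted source files.
# SOURCE_DIR is a placeholder; point it at your own extraction folder.
import glob

SOURCE_DIR = r"C:\Nazmul\Blog\Python\ETL\source"
for path in glob.glob(SOURCE_DIR + r"\*.*"):
    print(path)    # you should see the .csv, .json and .xml files from SOURCE.zip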
### Set Path
tmpfile    = "temp.tmp"                # file used to store all extracted data
logfile    = "logfile.txt"             # all event logs will be stored in this file
targetfile = "transformed_data.csv"    # file where transformed data is stored
### CSV Extract Function
def extract_from_csv(file_to_process):
    dataframe = pd.read_csv(file_to_process)
    return dataframe

### JSON Extract Function
def extract_from_json(file_to_process):
    dataframe = pd.read_json(file_to_process, lines=True)
    return dataframe

### XML Extract Function
def extract_from_xml(file_to_process):
    rows = []
    tree = ET.parse(file_to_process)
    root = tree.getroot()
    for person in root:
        name = person.find("name").text
        height = float(person.find("height").text)
        weight = float(person.find("weight").text)
        rows.append({"name": name, "height": height, "weight": weight})
    # DataFrame.append was removed in pandas 2.0, so collect the rows first
    # and build the data frame in one step
    dataframe = pd.DataFrame(rows, columns=["name", "height", "weight"])
    return dataframe
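extract_from_xml only assumes that each direct child of the XML root holds <name>, <height> and <weight> elements. The sketch below writes a tiny file in that layout and parses it back; the root tag name and the sample values are illustrative only, not part of the provided SOURCE.zip data.

# Sketch: create a minimal XML file in the expected layout and parse it.
import xml.etree.ElementTree as ET

root = ET.Element("data")                       # root tag name is arbitrary here
person = ET.SubElement(root, "person")
ET.SubElement(person, "name").text = "test"
ET.SubElement(person, "height").text = "66.0"
ET.SubElement(person, "weight").text = "120.0"
ET.ElementTree(root).write("sample.xml")

print(extract_from_xml("sample.xml"))           # one row: test, 66.0, 120.0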
### Extract Function
def extract():
    # create an empty data frame to hold extracted data
    extracted_data = pd.DataFrame(columns=['name', 'height', 'weight'])

    # DataFrame.append was removed in pandas 2.0, so use pd.concat to accumulate the data

    # process all csv files
    for csvfile in glob.glob(r"C:\Nazmul\Blog\Python\ETL\source\*.csv"):
        extracted_data = pd.concat([extracted_data, extract_from_csv(csvfile)], ignore_index=True)

    # process all json files
    for jsonfile in glob.glob(r"C:\Nazmul\Blog\Python\ETL\source\*.json"):
        extracted_data = pd.concat([extracted_data, extract_from_json(jsonfile)], ignore_index=True)

    # process all xml files
    for xmlfile in glob.glob(r"C:\Nazmul\Blog\Python\ETL\source\*.xml"):
        extracted_data = pd.concat([extracted_data, extract_from_xml(xmlfile)], ignore_index=True)

    return extracted_data
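If you want to see what the combined extract produces before transforming it, a quick check like the sketch below works; it assumes the source files are already in the folder referenced above.

# Sketch: run the extract step on its own and inspect the combined data.
raw = extract()
print(raw.head())                  # first few rows pulled from the CSV, JSON and XML files
print(len(raw), "rows extracted")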
### Transform
# The transform function does the following tasks:
# 1. Convert height, which is in inches, to meters
# 2. Convert weight, which is in pounds, to kilograms
def transform(data):
    # Convert the datatype of the column into float if needed
    # data.height = data.height.astype(float)
    # Convert inches to meters and round off to two decimals (one inch is 0.0254 meters)
    data['height'] = round(data.height * 0.0254, 2)

    # Convert the datatype of the column into float if needed
    # data.weight = data.weight.astype(float)
    # Convert pounds to kilograms and round off to two decimals (one pound is 0.45359237 kilograms)
    data['weight'] = round(data.weight * 0.45359237, 2)

    return data
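As a quick illustration of the arithmetic: 68.0 inches x 0.0254 = 1.7272, which rounds to 1.73 m, and 150.0 pounds x 0.45359237 is about 68.04 kg. The values in the sketch below are made up purely for demonstration.

# Sketch: apply transform to a one-row frame with illustrative values.
sample = pd.DataFrame({"name": ["test"], "height": [68.0], "weight": [150.0]})
print(transform(sample))   # height becomes 1.73 (meters), weight becomes 68.04 (kg)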
### Loading
def load(targetfile, data_to_load):
    data_to_load.to_csv(targetfile)
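To confirm the load step, you can read the target file back after the job has run. Note that to_csv writes the row index as an extra unnamed first column by default; pass index=False if you do not want it. The check below is just a sketch.

# Sketch: read the loaded file back and confirm the shape.
check = pd.read_csv(targetfile)
print(check.shape)   # same number of rows as transformed_data, plus the unnamed index column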
### Logging
def log(message):
    timestamp_format = '%Y-%b-%d-%H:%M:%S'   # Year-MonthName-Day-Hour:Minute:Second (%b is the portable abbreviated-month directive)
    now = datetime.now()                     # get current timestamp
    timestamp = now.strftime(timestamp_format)
    with open(logfile, "a") as f:
        f.write(timestamp + ',' + message + '\n')
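A quick way to see the log format is to write one entry and read it back; the exact timestamp obviously depends on when you run it.

# Sketch: write a test entry and show the last line of the log file.
log("Test message")
with open(logfile) as f:
    print(f.readlines()[-1].strip())   # a line like "<timestamp>,Test message"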
### Running the ETL Process
log("ETL Job Started")

log("Extract phase Started")
extracted_data = extract()
log("Extract phase Ended")
print(extracted_data)      # show the extracted data (a bare expression only displays in a notebook)

log("Transform phase Started")
transformed_data = transform(extracted_data)
log("Transform phase Ended")
print(transformed_data)    # show the transformed data

log("Load phase Started")
load(targetfile, transformed_data)
log("Load phase Ended")

log("ETL Job Ended")