Parsing Sqoop logs for statistics analysis
The Python code below extracts statistics from a set of Sqoop log files for transfer analysis.
#!/usr/bin/env python
"""Extract transfer statistics from a directory of Sqoop import log files.

Scans every file matching FILE_PATTERN under START_DIR, locates Teradata
import job blocks (delimited by BLOCK_BEGIN / BLOCK_END log lines), and
prints one tab-separated row per completed import: start/end timestamps,
table name, byte and record counters, map time, and elapsed minutes.
"""
import datetime
import fnmatch
import os

# Directory to scan and the log-file name pattern.
START_DIR = '/Users/robin/Downloads/EDW'
FILE_PATTERN = '*.log'

# Markers that delimit one import job inside a log file.
BLOCK_BEGIN = 'Importing from Teradata Table:'
BLOCK_END = 'Teradata import job completed with exit code'

# Counter labels whose trailing values we extract, in output-column order.
COUNTER_LABELS = (
    'FILE: Number of bytes read=',
    'FILE: Number of bytes written=',
    'HDFS: Number of bytes read=',
    'HDFS: Number of bytes written=',
    'Map input records=',
    'Map output records=',
    'Total time spent by all maps in occupied slots (ms)=',
)

# A complete record has 10 tab-separated fields, i.e. 9 tab characters.
TOTAL_COLUMN_COUNT_ONELESS = 9

# Sqoop log timestamp format: the first 17 characters of a log line.
LOG_TS_FMT = '%y/%m/%d %H:%M:%S'


def find_files(directory, pattern):
    """Yield full paths of files under *directory* whose name matches *pattern*."""
    for root, _dirs, files in os.walk(directory):
        for basename in files:
            if fnmatch.fnmatch(basename, pattern):
                yield os.path.join(root, basename)


def change_dt_format(dt_val):
    """Convert a 'yy/mm/dd HH:MM:SS' timestamp to 'mm/dd/yy HH:MM:SS'."""
    return datetime.datetime.strptime(dt_val, LOG_TS_FMT).strftime(
        '%m/%d/%y %H:%M:%S')


def datetime_diff_minutes(start_ts, end_ts):
    """Return the whole minutes between two 'yy/mm/dd HH:MM:SS' stamps, as str.

    Uses total_seconds() so spans longer than one day are counted correctly;
    timedelta.seconds alone discards whole days.
    """
    start = datetime.datetime.strptime(start_ts, LOG_TS_FMT)
    end = datetime.datetime.strptime(end_ts, LOG_TS_FMT)
    return str(int((end - start).total_seconds() // 60))


def main():
    """Walk START_DIR and print one stats row per completed Sqoop import."""
    # Header row: must stay in sync with the field order built below.
    print('State\tStart Time\tTable name\tfile_bytes_read\tfile_bytes_written'
          '\thdfs_bytes_read\thdfs_bytes_written\tMap time spent'
          '\tMap In Records\tMap Out Records\tEnd Time\tTime Diff(mins)'
          '\tFilename')

    record = ''
    ts_start = ts_end = ''
    for filename in find_files(START_DIR, FILE_PATTERN):
        # with-statement closes each log file (original leaked the handle).
        with open(filename, 'r') as f:
            for line in f:
                pos = line.find(BLOCK_BEGIN)
                if pos != -1:
                    # New job block: record the (reformatted) start time and
                    # the table name that follows the marker.
                    record += change_dt_format(line[:17])
                    ts_start = line[:17]
                    record += '\t' + line[pos + len(BLOCK_BEGIN):-1]

                # Append each counter value found on this line.
                for label in COUNTER_LABELS:
                    pos = line.find(label)
                    if pos != -1:
                        record += '\t' + line[pos + len(label):-1]

                pos = line.find(BLOCK_END)
                if pos != -1:
                    # Job finished: append the end time, then emit the row
                    # only if every expected column was captured.
                    record += '\t' + change_dt_format(line[:17])
                    ts_end = line[:17]
                    if record.count('\t') == TOTAL_COLUMN_COUNT_ONELESS:
                        print('Good\t' + record + '\t'
                              + datetime_diff_minutes(ts_start, ts_end)
                              + '\t' + filename)
                    record = ts_start = ts_end = ''


if __name__ == '__main__':
    main()