WSJ Top Loser Gainer pull
The code below pulls the daily biggest losers and gainers from wsj.com for a given date range.
#!/usr/bin/env python
"""Pull WSJ biggest-gainer / biggest-loser tables for a range of business days.

For every business day between <start_date> and <end_date> (inclusive), this
script downloads the "biggest gainers" and "biggest losers" pages from
wsj.com, scrapes the result table, and writes one tab-separated file per day
and per list into <output dir>.

Usage: script.py <start_date> <end_date> <output dir>   (dates as YYYYMMDD)
"""
import datetime
import os
import re
import sys

import humanize
import pandas
import requests
from lxml import html

# Extracts the ticker symbol from a company cell such as "Acme Corp. (ACME)".
# Compiled once here instead of once per table row.
_TICKER_RE = re.compile(r'\((.+?)\)')


class StockType:
    """Enumeration of the two WSJ "biggest movers" lists."""
    Loser = 0
    Gainer = 1


# Example pages:
#   gainers: http://www.wsj.com/mdc/public/page/2_3021-gaincomp-gainer-20170503.html
#   losers:  http://www.wsj.com/mdc/public/page/2_3021-losecomp-loser-20170502.html
def fetch_wsj_page_content(stock_type, dt):
    """Download and scrape one WSJ movers page.

    Parameters
    ----------
    stock_type : int
        ``StockType.Gainer`` or ``StockType.Loser``.
    dt : datetime.date or pandas.Timestamp
        The trading day to fetch (anything with ``strftime``).

    Returns
    -------
    str
        A TSV string (header line plus one line per stock, each newline
        terminated), or ``''`` when *stock_type* is not recognised.
    """
    dt_str = dt.strftime("%Y%m%d")
    if stock_type == StockType.Gainer:
        url = ('http://www.wsj.com/mdc/public/page/2_3021-gaincomp-gainer-'
               + dt_str + '.html')
        type_label = "Gainer"
    elif stock_type == StockType.Loser:
        url = ('http://www.wsj.com/mdc/public/page/2_3021-losecomp-loser-'
               + dt_str + '.html')
        type_label = "Loser"
    else:
        # Unknown list type: nothing to fetch.
        return ''

    print("Fetching contents from " + url)
    page_val = requests.get(url)
    print("Downloaded "
          + humanize.naturalsize(len(page_val.content), gnu=True) + " bytes.")

    tree = html.fromstring(page_val.content)
    # position()>1 skips the table's header row.  The company column is taken
    # from the <a> link text, which only exists in data rows.
    rank = tree.xpath('//table[@class="mdcTable"]/tr[position()>1]/td[1]/text()')
    company = tree.xpath('//table[@class="mdcTable"]/tr/td[2]/a/text()')
    price = tree.xpath('//table[@class="mdcTable"]/tr[position()>1]/td[3]/text()')
    change = tree.xpath('//table[@class="mdcTable"]/tr[position()>1]/td[4]/text()')
    change_percent = tree.xpath('//table[@class="mdcTable"]/tr[position()>1]/td[5]/text()')
    volume = tree.xpath('//table[@class="mdcTable"]/tr[position()>1]/td[6]/text()')

    # Build the TSV with a list + join instead of quadratic string +=.
    lines = ["rank\tticker\tcompany\tprice\tchange\tchange%\tvolume\tType\tdate"]
    for i in range(len(company)):
        m = _TICKER_RE.search(company[i])
        ticker = m.group(1) if m else ''
        lines.append("\t".join([
            rank[i],
            ticker,
            company[i].replace('\r', '').replace('\n', ''),
            price[i].replace('$', ''),
            change[i],
            change_percent[i],
            volume[i],
            type_label,
            dt_str,
        ]))
    return "\n".join(lines) + "\n"


def _write_table(table_val, output_file):
    """Write one scraped table to *output_file*; no-op when the table is empty."""
    if not table_val:
        return
    print("Writing to file " + output_file)
    # `with` guarantees the handle is flushed and closed (the original
    # leaked the file object and never closed it).
    with open(output_file, 'w') as f:
        f.write(table_val)


def main(argv):
    """Parse arguments and fetch gainers/losers for every business day."""
    for curr_arg, arg in enumerate(argv):
        print("Argument " + str(curr_arg) + ": " + arg)
    if len(argv) != 4:
        print("\nIncorrect set of arguments! (expected 3)\nUsage: "
              + os.path.basename(__file__)
              + " <start_date> <end_date> <output dir>\nDate format: YYYYMMDD\n")
        return 1

    start_date = datetime.datetime.strptime(argv[1], "%Y%m%d").date()
    end_date = datetime.datetime.strptime(argv[2], "%Y%m%d").date()
    destination_dir = argv[3]

    # bdate_range yields business days only, so weekends are skipped.
    for dt in pandas.bdate_range(start_date, end_date):
        day = dt.strftime("%Y%m%d")
        print('\n[ Processing date: ' + day + ' ]')
        _write_table(fetch_wsj_page_content(StockType.Gainer, dt),
                     os.path.join(destination_dir, day + "_gainers.tsv"))
        _write_table(fetch_wsj_page_content(StockType.Loser, dt),
                     os.path.join(destination_dir, day + "_losers.tsv"))
    print('\nDone')
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))