WSJ Top Gainers and Losers Pull

The code below pulls the daily top gainers and losers from wsj.com for a given date range and writes each day's lists out as TSV files.
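Usage: pass a start date, an end date (both in YYYYMMDD format) and an output directory, for example python wsj_movers.py 20170501 20170512 ./wsj_data, where wsj_movers.py is whatever name you save the script under. For each business day in the range, one gainers file and one losers file are written to the output directory.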

#!/usr/bin/env python
from lxml import html
import requests
import sys
import os
import humanize
import pandas
# from datetime import date, timedelta
import datetime
import re


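# Enum-style constants for the two kinds of WSJ movers pages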
class StockType:
    Loser = 0
    Gainer = 1

# gainer
# http://www.wsj.com/mdc/public/page/2_3021-gaincomp-gainer-20170503.html
# loser
# http://www.wsj.com/mdc/public/page/2_3021-losecomp-loser-20170502.html
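# Each page URL embeds the list type (gaincomp-gainer vs losecomp-loser) and the date as YYYYMMDD.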


def fetch_wsj_page_content(stock_type, dt):

    dt_str = dt.strftime("%Y%m%d")

    if stock_type == StockType.Gainer:
        url = 'http://www.wsj.com/mdc/public/page/2_3021-gaincomp-gainer-' + dt_str + '.html'
        stock_type = "Gainer"
    elif stock_type == StockType.Loser:
        url = 'http://www.wsj.com/mdc/public/page/2_3021-losecomp-loser-' + dt_str + '.html'
        stock_type = "Loser"
    else:
        return ''

    print "Fetching contents from " + url
    # return
    page_val = requests.get(url)
    print "Downloaded " + humanize.naturalsize(len(page_val.content), gnu=True) + " bytes."
    tree = html.fromstring(page_val.content)

    # mdcTable columns: rank | company (ticker) | price | change | change % | volume.
    # position()>1 skips the header row.
    rank = tree.xpath('//table[@class="mdcTable"]/tr[position()>1]/td[1]/text()')
    company = tree.xpath('//table[@class="mdcTable"]/tr[position()>1]/td[2]/a/text()')
    price = tree.xpath('//table[@class="mdcTable"]/tr[position()>1]/td[3]/text()')
    change = tree.xpath('//table[@class="mdcTable"]/tr[position()>1]/td[4]/text()')
    change_percent = tree.xpath('//table[@class="mdcTable"]/tr[position()>1]/td[5]/text()')
    volume = tree.xpath('//table[@class="mdcTable"]/tr[position()>1]/td[6]/text()')

    # header row for the TSV output
    str_out = "rank\tticker\tcompany\tprice\tchange\tchange%\tvolume\tType\tdate\n"

    for i in range(len(company)):
        # Pull the ticker symbol out of the company text, e.g. "Acme Corp (ACME)" -> ACME
        ticker = ''
        m = re.search(r'\((.+?)\)', company[i])
        if m:
            ticker = m.group(1)

        str_out += "\t".join([rank[i], ticker,
                              company[i].replace('\r', '').replace('\n', ''),
                              price[i].replace('$', ''), change[i],
                              change_percent[i], volume[i],
                              stock_type, dt_str]) + "\n"
    return str_out


for curr_arg, arg in enumerate(sys.argv):
    print "Argument " + str(curr_arg) + ": " + arg


if len(sys.argv) != 4:
    print "\nIncorrect set of arguments! (expected 3)\nUsage: " + os.path.basename(__file__) + " <start_date> <end_date> <output dir>\nDate format: YYYYMMDD\n"
    exit(1)


start_date = sys.argv[1]
end_date = sys.argv[2]
destination_dir = sys.argv[3]

start_date = datetime.datetime.strptime(start_date, "%Y%m%d").date()
end_date = datetime.datetime.strptime(end_date, "%Y%m%d").date()

# # this will give you a list containing all of the dates (old school)
# date_range_list = [start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)]

# Business-day range (Mon-Fri); weekends are skipped since the markets are closed.
date_range = pandas.bdate_range(start_date, end_date)

for dt in date_range:
    print '\n[ Processing date: ' + dt.strftime("%Y%m%d") + ' ]'
    table_val = fetch_wsj_page_content(StockType.Gainer, dt)
    if table_val:
        output_file = destination_dir + "/" + dt.strftime("%Y%m%d") + "_gainers.tsv"
        print "Writing to file " + output_file
        with open(output_file, 'w') as f:
            f.write(table_val)

    table_val = fetch_wsj_page_content(StockType.Loser, dt)
    if table_val:
        output_file = destination_dir + "/" + dt.strftime("%Y%m%d") + "_losers.tsv"
        print "Writing to file " + output_file
        with open(output_file, 'w') as f:
            f.write(table_val)

print '\nDone'
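
The resulting TSV files are easy to pull back into pandas for a quick look across the whole date range. The sketch below is not part of the script above, just one way to consume its output; the directory path is a placeholder, and it relies only on the header row and the Type/date columns the script writes into every file.

# Minimal sketch: combine the generated *.tsv files into a single DataFrame.
# './wsj_data' stands in for whatever <output dir> was passed to the script.
import glob
import pandas

frames = [pandas.read_csv(path, sep='\t') for path in glob.glob('./wsj_data/*.tsv')]
if frames:
    movers = pandas.concat(frames, ignore_index=True)
    # change% can come through as strings, so coerce to numbers before sorting
    movers['change%'] = pandas.to_numeric(movers['change%'], errors='coerce')
    # e.g. the ten largest single-day percentage moves across the range
    print(movers.sort_values('change%', ascending=False).head(10))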

