Python web scraping (stockfetcher.com pull)
The Python code below fetches all contents from five URLs: General Discussion, Filter Exchange, Public Filters, Stock Picks, and Indicators.
Destination dir: /tmp
File type: csv
#!/usr/bin/env python3
"""Scrape StockFetcher forum listing pages into per-forum CSV files.

For each forum URL, pages are fetched in steps of 50 (the site's paging
offset) until a page with no rows is returned.  Each row (subject, link,
author, reply count, last-post date) is appended to /tmp/<Forum-Name>.csv.
"""
import csv
import os

import requests
from lxml import html

BASE_URL = 'http://www.stockfetcher.com'
# All columns live in the same listing table; td[n] selects each field.
ROW_XPATH = '//table[@class="table table-condensed"]/tbody/tr'
PAGE_STEP = 50      # StockFetcher pages its forum listings 50 rows at a time
OUTPUT_DIR = '/tmp'


def printable(input_str):
    """Return *input_str* with every non-ASCII character replaced by a space."""
    return ''.join(ch if ord(ch) < 128 else ' ' for ch in input_str)


def process_url_content(input_url, output_file):
    """Fetch one forum listing page and append its rows to *output_file*.

    Returns 1 when the page contained at least one row (caller should fetch
    the next page), 0 when the page was empty (end of the forum listing).
    """
    print("\nFetching contents from " + input_url)
    # timeout so a dead server cannot hang the scrape forever
    response = requests.get(input_url, timeout=30)
    print("\nDownloaded %d bytes." % len(response.content))

    tree = html.fromstring(response.content)
    subjects = tree.xpath(ROW_XPATH + '/td[1]/a/text()')
    rel_links = tree.xpath(ROW_XPATH + '/td[1]/a/@href')
    user_names = tree.xpath(ROW_XPATH + '/td[2]/text()')
    reply_counts = tree.xpath(ROW_XPATH + '/td[3]/text()')
    last_dates = tree.xpath(ROW_XPATH + '/td[4]/text()')

    if not subjects:
        return 0

    # Bug fix: the original wrote the header row on every append, so each
    # fetched page repeated the header inside the CSV.  Write it only when
    # the file is new/empty.
    need_header = (not os.path.exists(output_file)
                   or os.path.getsize(output_file) == 0)

    records = 0
    # newline='' is required by the csv module; utf-8 replaces the old
    # sys.setdefaultencoding() hack.
    with open(output_file, 'a', newline='', encoding='utf-8') as output:
        writer = csv.writer(output)
        if need_header:
            writer.writerow(('Subject', 'URL', 'Clickable URL',
                             'Author', 'Replies', 'Last'))
        # zip() stops at the shortest list, replacing the original's
        # IndexError-driven loop termination.
        for subject, rel_link, author, replies, last in zip(
                subjects, rel_links, user_names, reply_counts, last_dates):
            full_url = BASE_URL + rel_link
            writer.writerow((subject, full_url,
                             '=HYPERLINK("' + full_url + '")',
                             author, replies, printable(last)))
            records += 1

    print("\nWrote %d records in %s" % (records, output_file))
    return 1


def main():
    """Walk every forum's listing pages until an empty page is reached."""
    url_list = [
        'http://www.stockfetcher.com/forums2/General-Discussion',
        'http://www.stockfetcher.com/forums2/Filter-Exchange',
        'http://www.stockfetcher.com/forums2/Stock-Picks',
        'http://www.stockfetcher.com/forums2/Indicators',
        'http://www.stockfetcher.com/forums2/Public-Filters',
    ]
    for link in url_list:
        # Output file is named after the last path segment of the forum URL.
        output_filename = os.path.join(OUTPUT_DIR,
                                       link.rsplit('/', 1)[-1] + '.csv')
        offset = 0
        while process_url_content(link + '/' + str(offset),
                                  output_filename) == 1:
            offset += PAGE_STEP


if __name__ == '__main__':
    main()