Tag Archives: script

Zappos Review Parser

def create_hashing_table():
    import md5
    # Each input line is expected to look like "product_id::url"
    urls = open("product_id_with_urls").readlines()
    hashing_table = []
    for url in urls:
        u = url
        url = url.split("::")[1]
        if len(url) > 1:
            # Crawled pages are stored under the MD5 hash of their URL
            file_name = str(md5.new(url.strip()).hexdigest()) + ".uss"
            hashing_table.append(str(file_name + ":.:" + u))
    return hashing_table

def find_date(content):
    try:
        pattern = '<div>'
        l = len(pattern)
        focus = content[content.find(pattern) + l:content.find('<div>')]
        return str(focus[focus.find('<p>') + 3:focus.find('</p')]).strip()
    except Exception, ex:
        return False

def find_review(content):
    try:
        pattern = '<p'
        l = len(pattern)
        focus = content[content.find(pattern) + l:content.find('<p>')]
        return str(focus[focus.find('>') + 1:focus.find('</p')]).strip()
    except Exception, ex:
        return False

def find_author(content):
    try:
        pattern = '<div>'
        l = len(pattern)
        focus = content[content.find(pattern) + l:content.find('<p>')]
        return str(focus[focus.find('<h3>') + 4:focus.find('</h3')]).strip()
    except Exception, ex:
        return False

def find_rating(content):
    try:
        pattern = '<div>'
        l = len(pattern)
        focus = content[content.find(pattern) + l:content.find('<p><strong>Comfort</strong>')]
        return str(int(focus[focus.find('Rated:') + 6:focus.find(' stars')])).strip()
    except Exception, ex:
        return False

def start():
    import os
    hashing_table = create_hashing_table()
    count = 0
    success = 0
    # Match each downloaded file against the hash table to recover its URL
    for filename in os.listdir('round2'):
        for x in hashing_table:
            #print x, filename
            if x.find(filename) > -1:
                #print "matched"
                url = x.split(":.:")[1].split("::")[1]
                product_id = x.split(":.:")[1].split("::")[0]
                if url.find("www.zappos.com") > -1:
                    if parse_reviews(filename, url, product_id):
                        success += 1
                    count += 1
    print success, "/", count

def parse_reviews(filename, url, product_id):
    html = open("round2/" + filename).read()
    # Each review block starts with a <div>
    content = html.split('<div>')[1:]
    for x in content:
        date = find_date(x)
        review = find_review(x)
        rating = find_rating(x)
        author = find_author(x)
        if product_id and url and date and review and rating and author:
            try:
                create_uml(product_id, url, date, review, rating, author)
            except Exception, ex:
                print "Error Writing", filename, ex
                return False
        else:
            print "Error Parsing ", filename, product_id, url, date, review, rating, author
            return False
    return True

def create_uml(product_id, url, date, review, rating, author):
    # One record per review: ":::" between fields, "--->" between records
    pattern = product_id + ":::" + url + ":::" + date + ":::" + review + ":::" + rating + ":::" + author + "--->"
    #print pattern
    f = open("zappos.uml", "a+")
    f.write(pattern)
    f.close()

start()
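
create_uml packs every review into a single zappos.uml stream, with ":::" between fields and "--->" between records. A minimal sketch of reading that file back into tuples, assuming those delimiters and the field order written by create_uml:

# Minimal sketch: read zappos.uml back into tuples.
# Assumes the ":::" field and "--->" record delimiters used by create_uml.
def read_uml(path="zappos.uml"):
    records = open(path).read().split("--->")
    reviews = []
    for record in records:
        fields = record.split(":::")
        if len(fields) == 6:
            # (product_id, url, date, review, rating, author)
            reviews.append(tuple(fields))
    return reviews

for product_id, url, date, review, rating, author in read_uml():
    print product_id, rating, author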


Email via UNIX Command

Command:

mail -s "subject" to-email@id.com -a "Reply-To" <<< "Your Message"

Note:

Press Enter to break lines inside the message; escape sequences such as '\n' won't work.

Example:

mail -s "URLBroker Process Failure" utsav.sabharwal@nextag.com,jyadav@nextag.com -a "" <<< "Hie Team"
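
The same alert can also be sent from Python without shelling out to mail. A minimal sketch using the standard smtplib module; the localhost SMTP server and the sender address are assumptions:

import smtplib

# Minimal sketch: send the same alert via a local SMTP server.
# The server hostname and sender address are placeholders.
sender = "urlbroker@example.com"
recipients = ["utsav.sabharwal@nextag.com", "jyadav@nextag.com"]
message = ("Subject: URLBroker Process Failure\n"
           "\n"
           "Hie Team")

server = smtplib.SMTP("localhost")
server.sendmail(sender, recipients, message)
server.quit()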


Writing a Cron Job

Open Cron:

crontab -e

0 * * * * filename <-- runs at minute 0 of every hour

Note:

Make sure file is executable

chmod 0755 filename

The .sh extension alone does not define which interpreter should run the file; the shebang line does.

#!/bin/bash <-- add this line to the top of your .sh code
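
The same shebang rule applies to scheduled Python scripts. A minimal sketch of an hourly cron job; the script name, log path, and install location are assumptions:

#!/usr/bin/env python
# hourly_job.py -- minimal sketch of a script meant to be run from cron.
# Make it executable first:  chmod 0755 hourly_job.py
# Crontab entry (path is an assumption):  0 * * * * /home/user/hourly_job.py
import time

log = open("/tmp/hourly_job.log", "a")
log.write("ran at %s\n" % time.ctime())
log.close()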


Curl Single Threaded Crawler using TOR

import subprocess
import sys, threading, time
import pycurl

def refresh_ip():
    print "Refreshing IP ..."
    try:
        # Restart tor to get a new circuit, and hence a new exit IP
        process = subprocess.Popen('sudo /etc/init.d/tor restart', shell=True, stdout=subprocess.PIPE)
    except Exception, ex:
        print "Failed to Refresh IP. ", ex

# We should ignore SIGPIPE when using pycurl.NOSIGNAL - see
# the libcurl tutorial for more info.
try:
    import signal
    from signal import SIGPIPE, SIG_IGN
    signal.signal(signal.SIGPIPE, signal.SIG_IGN)
except ImportError:
    pass

class Test(threading.Thread):
    def __init__(self, url, ofile):
        threading.Thread.__init__(self)
        self.curl = pycurl.Curl()
        self.curl.setopt(pycurl.URL, url)
        self.curl.setopt(pycurl.WRITEDATA, ofile)
        self.curl.setopt(pycurl.FOLLOWLOCATION, 1)
        self.curl.setopt(pycurl.MAXREDIRS, 5)
        self.curl.setopt(pycurl.NOSIGNAL, 1)
        self.curl.setopt(pycurl.USERAGENT, 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.1 (KHTML, like Gecko) Ubuntu/11.04 Chromium/14.0.835.202 Chrome/14.0.835.202 Safari/535.1')
        # Route the request through the local Tor SOCKS proxy
        self.curl.setopt(pycurl.PROXY, '127.0.0.1:9050')
        self.curl.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_SOCKS5)
        self.curl.setopt(pycurl.REFERER, 'http://www.google.co.in/')

    def run(self):
        self.curl.perform()
        self.curl.close()
        sys.stdout.write(".")
        sys.stdout.flush()

# Read list of URIs from file specified on commandline
try:
    urls = open(sys.argv[1]).readlines()
except IndexError:
    # No file was specified, show usage string
    print "Usage: %s <file with uris to fetch>" % sys.argv[0]
    raise SystemExit

# Initialize thread array and the file number
threads = []

# Fetch each URI in sequence: start a thread, then join it immediately,
# so only one download runs at a time
fileno = 0
t1 = time.time()
for url in urls:

    # Save each response to a file named after its position in the list
    f = open(str(fileno), "wb")
    t = Test(url.rstrip(), f)
    t.start()
    fileno = fileno + 1
    t.join()
    f.close()
    # Rotate the exit IP, then give tor a moment to come back up
    refresh_ip()
    time.sleep(3)
t2 = time.time()
print "\n** Singlethreading, %d seconds elapsed for %d uris" % (int(t2-t1), len(urls))
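
To check that restarting tor actually rotated the exit node, the same proxy settings can be pointed at an IP-echo service between fetches. A minimal sketch; icanhazip.com is an assumption (any echo service works), and tor can alternatively be asked for a fresh circuit over its control port with SIGNAL NEWNYM instead of a full restart:

import pycurl

# Minimal sketch: report the current exit IP through the Tor proxy.
# Assumes tor is listening on 127.0.0.1:9050 as configured above.
def current_exit_ip():
    chunks = []
    c = pycurl.Curl()
    c.setopt(pycurl.URL, 'http://icanhazip.com')
    c.setopt(pycurl.PROXY, '127.0.0.1:9050')
    c.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_SOCKS5)
    c.setopt(pycurl.WRITEFUNCTION, chunks.append)
    c.perform()
    c.close()
    return "".join(chunks).strip()

print "Current exit IP:", current_exit_ip()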
