Zappos Review Parser

def create_hashing_table():
    """Build the table that maps cached page file names to their source lines.

    Reads the file ``product_id_with_urls`` from the current directory. Each
    line is split on ``"::"`` and the second field is used as the URL. For
    every line whose URL field is longer than one character an entry of the
    form ``<md5 hex of stripped url>.uss:.:<original line>`` is produced. The
    original line is kept unmodified, including its trailing newline.

    Lines without a ``"::"`` separator (for example blank lines) are skipped.

    Returns:
        A list of entry strings, in file order.
    """
    # hashlib replaces the ``md5`` module, which no longer exists in Python 3;
    # hashlib.md5 produces the same hex digest.
    import hashlib

    hashing_table = []
    # Context manager so the handle is closed even if a line fails to process.
    with open("product_id_with_urls") as url_file:
        for line in url_file:
            fields = line.split("::")
            if len(fields) < 2:
                # No separator on this line: nothing to hash.
                continue
            url = fields[1]
            if len(url) > 1:
                digest_input = url.strip()
                # Python 3 hashes bytes only; Python 2 str is already bytes.
                if not isinstance(digest_input, bytes):
                    digest_input = digest_input.encode("utf-8")
                file_name = hashlib.md5(digest_input).hexdigest() + ".uss"
                hashing_table.append(file_name + ":.:" + line)
    return hashing_table

def find_date(content):
    """Return the text of the first ``<p>`` element in the date section.

    The section is the slice of *content* that starts right after the first
    start marker and ends at the first end marker. The result is the text
    between the first ``<p>`` and the first ``</p`` inside that section, with
    surrounding whitespace removed.

    NOTE(review): the start and end markers are both the bare ``'<div>'``, so
    whenever the marker is present the section is empty and ``''`` is
    returned. The markers presumably lost their attributes when this code was
    published -- confirm the real markers against the saved pages.

    Args:
        content: HTML fragment of a single review.

    Returns:
        The stripped text (possibly ``''``), or ``False`` if any step raises,
        e.g. when *content* is not a string.
    """
    start_marker = '<div>'
    end_marker = '<div>'
    try:
        # str.find() returns -1 for a missing marker. The published index
        # arithmetic is kept unchanged, so a missing marker shifts the slice
        # boundaries instead of raising.
        begin = content.find(start_marker) + len(start_marker)
        end = content.find(end_marker)
        focus = content[begin:end]
        return str(focus[focus.find('<p>') + 3:focus.find('</p')]).strip()
    except Exception:
        # Best effort: parse_reviews treats any falsy result as a failure.
        return False

def find_review(content):
    """Return the review text found in the first paragraph tag of *content*.

    The section is the slice of *content* that starts right after the first
    ``'<p'`` and ends at the first ``'<p>'``. Within that section the result
    is the text after the first ``'>'`` up to the first ``'</p'``, with
    surrounding whitespace removed.

    NOTE(review): if the first ``<p`` tag in *content* is itself a plain
    ``<p>``, both searches hit the same position, the section is empty and
    ``''`` is returned. The opening marker presumably carried attributes
    before this code was published -- confirm against the saved pages.

    Args:
        content: HTML fragment of a single review.

    Returns:
        The stripped text (possibly ``''``), or ``False`` if any step raises,
        e.g. when *content* is not a string.
    """
    start_marker = '<p'
    try:
        # str.find() returns -1 for a missing marker; the published index
        # arithmetic is kept unchanged rather than treating that as an error.
        begin = content.find(start_marker) + len(start_marker)
        end = content.find('<p>')
        focus = content[begin:end]
        return str(focus[focus.find('>') + 1:focus.find('</p')]).strip()
    except Exception:
        # Best effort: parse_reviews treats any falsy result as a failure.
        return False

def find_author(content):
    """Return the text of the first ``<h3>`` element in the author section.

    The section is the slice of *content* that starts right after the first
    ``'<div>'`` and ends at the first ``'<p>'``. The result is the text
    between the first ``<h3>`` and the first ``</h3`` inside that section,
    with surrounding whitespace removed.

    NOTE(review): the bare ``'<div>'`` marker presumably carried attributes
    before this code was published -- confirm against the saved pages.

    Args:
        content: HTML fragment of a single review.

    Returns:
        The stripped text (possibly ``''``), or ``False`` if any step raises,
        e.g. when *content* is not a string.
    """
    start_marker = '<div>'
    try:
        # str.find() returns -1 for a missing marker; the published index
        # arithmetic is kept unchanged rather than treating that as an error.
        begin = content.find(start_marker) + len(start_marker)
        end = content.find('<p>')
        focus = content[begin:end]
        return str(focus[focus.find('<h3>') + 4:focus.find('</h3')]).strip()
    except Exception:
        # Best effort: parse_reviews treats any falsy result as a failure.
        return False

def find_rating(content):
    """Return the star rating of a review as a decimal string.

    The section is the slice of *content* that starts right after the first
    ``'<div>'`` and ends at the first ``'<p><strong>Comfort</strong>'``.
    The text between ``'Rated:'`` and ``' stars'`` inside that section is
    converted with ``int()`` and returned as a string, e.g. ``'4'``.

    NOTE(review): the bare ``'<div>'`` marker presumably carried attributes
    before this code was published -- confirm against the saved pages.

    Args:
        content: HTML fragment of a single review.

    Returns:
        The rating as a string, or ``False`` if the extracted text is not an
        integer or any other step raises.
    """
    start_marker = '<div>'
    try:
        # str.find() returns -1 for a missing marker; the published index
        # arithmetic is kept unchanged. A bad slice normally fails in int().
        begin = content.find(start_marker) + len(start_marker)
        end = content.find('<p><strong>Comfort</strong>')
        focus = content[begin:end]
        # int() tolerates surrounding whitespace and rejects anything else.
        return str(int(focus[focus.find('Rated:') + 6:focus.find(' stars')]))
    except Exception:
        # Best effort: parse_reviews treats any falsy result as a failure.
        return False

def start():
    """Parse every saved page in ``round2`` that belongs to a zappos.com URL.

    Each file name in the ``round2`` directory is looked up in the table
    returned by ``create_hashing_table()``. Table entries have the form
    ``<file name>:.:<product id>::<url>``. For every matching entry whose URL
    contains ``www.zappos.com`` the page is passed to ``parse_reviews``.

    Prints ``<success> / <count>`` at the end, where *count* is the number of
    pages handed to ``parse_reviews`` and *success* the number for which it
    returned a truthy value.

    NOTE(review): the published listing had no indentation; counting every
    attempted zappos.com page in *count* is the reconstruction used here.
    """
    import os

    # Index the entries by file name once, so each directory entry is an exact
    # dictionary lookup instead of a substring scan over the whole table.
    # A list per key keeps duplicate entries for the same file.
    lines_by_file = {}
    for entry in create_hashing_table():
        file_name, _, line = entry.partition(":.:")
        lines_by_file.setdefault(file_name, []).append(line)

    count = 0
    success = 0
    for filename in os.listdir('round2'):
        for line in lines_by_file.get(filename, ()):
            fields = line.split("::")
            product_id = fields[0]
            url = fields[1]
            if "www.zappos.com" not in url:
                continue
            count += 1
            if parse_reviews(filename, url, product_id):
                success += 1
    # Single formatted argument: prints the same text on Python 2 and 3.
    print("%s / %s" % (success, count))

def parse_reviews(filename, url, product_id):
    """Parse one saved page from ``round2`` and write out its reviews.

    The page is split on ``'<div>'``; every chunk after the first is treated
    as one review and run through ``find_date``, ``find_review``,
    ``find_rating`` and ``find_author``. A review whose fields are all truthy
    is passed to ``create_uml``.

    Args:
        filename: Name of the page file inside the ``round2`` directory.
        url: URL associated with the page; copied into every record.
        product_id: Product identifier; copied into every record.

    Returns:
        ``True`` if every chunk was parsed and written (also when the page
        has no chunks). ``False`` as soon as one chunk has a missing field or
        ``create_uml`` raises; records written before that point are kept.

    NOTE(review): the published listing had no indentation; stopping at the
    first unparsable chunk is the reconstruction used here.
    """
    # Context manager so the handle is closed even if read() fails.
    with open("round2/" + filename) as page:
        html = page.read()

    for chunk in html.split('<div>')[1:]:
        date = find_date(chunk)
        review = find_review(chunk)
        rating = find_rating(chunk)
        author = find_author(chunk)
        record = (product_id, url, date, review, rating, author)

        if not all(record):
            # Joined by hand so the message is identical on Python 2 and 3.
            print(" ".join(str(part) for part in ("Error Parsing ", filename) + record))
            return False

        try:
            create_uml(product_id, url, date, review, rating, author)
        except Exception as ex:
            print("Error Writing %s %s" % (filename, ex))
            return False
    return True

def create_uml(product_id, url, date, review, rating, author):
    """Append one review record to ``zappos.uml`` in the current directory.

    The record is the six fields joined by ``":::"`` and terminated by
    ``"—>"``. No newline is written between records.

    Args:
        product_id: Product identifier string.
        url: Product page URL string.
        date: Review date string.
        review: Review text string.
        rating: Rating string.
        author: Author name string.

    Raises:
        TypeError: If any field is not a string.
        IOError/OSError: If the file cannot be opened or written.
    """
    # NOTE(review): the terminator contains an em dash, which may be a
    # typographic rendering of ASCII dashes from when this code was published.
    # Confirm with whatever reads zappos.uml before changing it.
    record = ":::".join((product_id, url, date, review, rating, author)) + "—>"
    # Context manager so the file is closed even if write() fails.
    with open("zappos.uml", "a") as out:
        out.write(record)

# Entry point: the whole parse runs whenever this file is executed or imported.
start()

Advertisements
Tagged , , , , ,

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s

%d bloggers like this: