Monthly Archives: December 2011

Inserting Data from a Text File into a MySQL Table

  • Each column must be tab separated
  • Each row must be a new line
  • The total number of columns in the file should equal the total number of columns in the table, to avoid ambiguity

Step 1: Copy the file to a local folder on the MySQL machine (e.g., the root folder)

Step 2: Run the following command in the MySQL interface:

load data local infile "filename" into table data_table;
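
For example, given a hypothetical three-column table and a matching tab-separated file (the table, column, and file names here are made up for illustration):

-- data.txt, tab-separated, one row per line:
-- 1	Alice	23
-- 2	Bob	31

create table data_table (id int, name varchar(50), age int);
load data local infile 'data.txt' into table data_table;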

Reference: http://forums.mysql.com/read.php?10,155114,155141

XPath in Python using lxml

from lxml import etree

def extract_html(xpath, html):
	# This method takes an XPath expression and HTML content as input
	# and returns a list of tags and corresponding content separated by ':::'
	result = []
	tree = etree.HTML(html)
	r = tree.xpath(xpath)
	for x in r:
		pattern = x.tag + ":::" + (x.text or "")
		result.append(pattern)
	return result
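
A quick usage sketch (the sample HTML and XPath here are made up for illustration):

html = "<html><body><h1>Title</h1><p>First</p><p>Second</p></body></html>"
print extract_html("//p", html)
# ['p:::First', 'p:::Second']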

Zappos Review Parser

def create_hashing_table():
	# Maps each crawled file name (the MD5 of the product URL, plus ".uss")
	# back to its original "product_id::url" line.
	import md5
	urls = open("product_id_with_urls").readlines()
	hashing_table = []
	for url in urls:
		u = url
		url = url.split("::")[1]
		if(len(url)>1):
			file_name = str(md5.new(url.strip()).hexdigest())+".uss"
			hashing_table.append(str(file_name+":.:"+u))
	return hashing_table

def find_date(content):
	# Slices the review date out of one review block
	try:
		pattern = '<div>'
		l = len(pattern)
		focus = content[content.find(pattern)+l:content.find('<div>')]
		return str(focus[focus.find('<p>')+3:focus.find('</p')]).strip()
	except Exception, ex:
		return False

def find_review(content):
	# Slices the review text out of one review block
	try:
		pattern = '<p'
		l = len(pattern)
		focus = content[content.find(pattern)+l:content.find('<p>')]
		return str(focus[focus.find('>')+1:focus.find('</p')]).strip()
	except Exception, ex:
		return False

def find_author(content):
	# Slices the author name out of one review block
	try:
		pattern = '<div>'
		l = len(pattern)
		focus = content[content.find(pattern)+l:content.find('<p>')]
		return str(focus[focus.find('<h3>')+4:focus.find('</h3')]).strip()
	except Exception, ex:
		return False

def find_rating(content):
	# Slices the star rating out of one review block and casts it to int
	try:
		pattern = '<div>'
		l = len(pattern)
		focus = content[content.find(pattern)+l:content.find('<p><strong>Comfort</strong>')]
		return str(int(focus[focus.find('Rated:')+6:focus.find(' stars')])).strip()
	except Exception, ex:
		return False

def start():
	import os
	hashing_table = create_hashing_table()
	count = 0
	success = 0
	for filename in os.listdir('round2'):
		for x in hashing_table:
			#print x, filename
			if(x.find(filename)>-1):
				#print "matched"
				url = x.split(":.:")[1].split("::")[1]
				product_id = x.split(":.:")[1].split("::")[0]
				if(url.find("www.zappos.com")>-1):
					if(parse_reviews(filename, url, product_id)):
						success+=1
					count+=1
	print success, "/", count

def parse_reviews(filename, url, product_id):
	# Splits a crawled page into review blocks and extracts the fields from each
	html = open("round2/"+filename).read()
	content = html.split('<div>')[1:]
	for x in content:
		date = find_date(x)
		review = find_review(x)
		rating = find_rating(x)
		author = find_author(x)
		if(product_id and url and date and review and rating and author):
			try:
				create_uml(product_id, url, date, review, rating, author)
			except Exception, ex:
				print "Error Writing", filename, ex
				return False
		else:
			print "Error Parsing ", filename, product_id, url, date, review, rating, author
			return False
	return True

def create_uml(product_id, url, date, review, rating, author):
	# Appends one ':::'-delimited record, terminated by '--->', to zappos.uml
	pattern = product_id+":::"+url+":::"+date+":::"+review+":::"+rating+":::"+author+"--->"
	#print pattern
	f = open("zappos.uml", "a+")
	f.write(pattern)
	f.close()

start()
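
For reference, a minimal sketch of reading the records back out of zappos.uml, assuming the ':::' field and '--->' record delimiters written by create_uml above:

def read_uml():
	# Split into records on '--->', then each record into fields on ':::'
	records = open("zappos.uml").read().split("--->")
	for r in records:
		fields = r.split(":::")
		if(len(fields) == 6):
			product_id, url, date, review, rating, author = fields
			print product_id, rating, author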


To check if a page has a Bazaar Voice / hReview / Power Review structure

import os

count = 0
f1 = open("shortlisted", "a+")
for file in os.listdir('.'):
    f = open(file)
    content = f.read().lower()
    f.close()
    # Class names used by hReview, Power Review, and Bazaar Voice markup
    if(content.find('hreview')>-1 or content.find('prreviewwrap')>-1 or content.find('pr-review-wrap')>-1 or content.find('bvstandalonereviewsectionreview')>-1 or content.find('bvrrcontentreview')>-1):
        # Write an mv command that moves the matched page into shortlisted/
        text = str("mv crawler_6DEC/"+file+" shortlisted/"+file)+chr(10)
        print text
        f1.write(text)
        f1.flush()
        count+=1

print count
f1.close()
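
Since each line written to shortlisted is a complete mv command, the file can later be executed directly as a shell script (assuming the crawler_6DEC/ and shortlisted/ paths exist relative to where you run it):

sh shortlisted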

Search for a filename in the current directory using pattern matching

import os
import fnmatch

# Print every .txt file in the current directory
for file in os.listdir('.'):
    if fnmatch.fnmatch(file, '*.txt'):
        print file

Domain Extractor (Draft)

import urllib

def get_domain(url):
	# Split the URL on dots and walk the pieces; the piece just before
	# a valid TLD is taken to be the domain name.
	url = url.split(".")
	# The IANA list is uppercase, so lowercase it for substring matching
	tlds = urllib.urlopen("http://data.iana.org/TLD/tlds-alpha-by-domain.txt").read().lower()
	for x in range(0, len(url)):
		try:
			xx = url[x].split("/")
			for y in xx:
				if(len(y)>0 and tlds.find(y)>-1):
					return url[x-1]
		except Exception, ex:
			print x, ex
			continue
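
A quick check of the function (the example URL is hypothetical; note the IANA list is downloaded on every call):

print get_domain("http://www.example.com/some/page")
# prints: example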

Shell script to find a process using grep and take an action accordingly using if/else

#!/bin/bash
x=start.sh
y=`ps -ef | grep start.sh | grep -v grep | awk '{print $9}' | head -1`
if [ "$y" = "$x" ]
then
	mail -s "SUCCESS: Fetcher Hourly Check" utsav.sabharwal@nextag.com,jyadav@nextag.com -a ""<<< "Hi Team,

This is to inform you that start.sh is running smoothly, so just sit back and chill.

I will recheck after one hour.

Best Regards,
Unix Script
etc/cron.hourly/mycron.sh
BETA Fetcher"

else
	mail -s "FAILURE: Fetcher Hourly Check" utsav.sabharwal@nextag.com,jyadav@nextag.com -a ""<<< "Hi Team,

This is to inform you that start.sh is not running on BETA URLBroker. Please take appropriate action asap.

Please note that I have been designed to update you every hour if I don't find a start.sh process running on my machine.

Best Regards,
Unix Script
/mycron.sh
BETA Fetcher"
fi

Email via UNIX Command

Command:

mail -s "subject" to-email@id.com -a "Reply-To" <<< "Your Message"

Note:

Press Enter to start a new line; escape sequences such as '\n' won't work.

Example:

mail -s "URLBroker Process Failure" utsav.sabharwal@nextag.com,jyadav@nextag.com -a ""<<< "Hi Team"


Writing a Cron Job

Open Cron:

crontab -e

0 * * * * filename <-- command for an hourly run (five fields: minute, hour, day of month, month, day of week)

Note:

Make sure the file is executable:

chmod 0755 filename

The .sh extension does not define which application should be used to run the file.

#!/bin/bash <-- add this line to the top of your .sh code
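
Putting it together, a minimal hypothetical example (the script path and log file are made up):

#!/bin/bash
# /etc/cron.hourly/mycron.sh -- appends a timestamp once per hour
echo "hourly run at `date`" >> /tmp/mycron.log

chmod 0755 /etc/cron.hourly/mycron.sh
crontab -e    # then add the line: 0 * * * * /etc/cron.hourly/mycron.sh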
