Monthly Archives: November 2011

Curl Single Threaded Crawler using TOR

import subprocess
import sys, threading, time
import pycurl

def refresh_ip():
	"""Restart the local Tor service so subsequent requests use a new exit IP.

	Best-effort: any failure is printed and swallowed so the crawl loop
	keeps running. Requires passwordless sudo for the Tor init script.
	"""
	print("Refreshing IP .. .")
	try:
		# Argument list instead of a shell string (no injection surface),
		# and wait for the restart to finish instead of firing-and-forgetting.
		subprocess.run(['sudo', '/etc/init.d/tor', 'restart'],
			stdout=subprocess.PIPE)
	except Exception as ex:
		print("Failed to Refresh IP. ", ex)

# We should ignore SIGPIPE when using pycurl.NOSIGNAL - see
# the libcurl tutorial for more info.
try:
    import signal
    signal.signal(signal.SIGPIPE, signal.SIG_IGN)
except (ImportError, AttributeError):
    # No signal module, or no SIGPIPE on this platform (e.g. Windows,
    # where `signal` imports but lacks the attribute) - nothing to do.
    pass

class Test(threading.Thread):
    """Fetch one URL through the local Tor SOCKS proxy on its own thread.

    The response body is streamed into *ofile*; a '.' is written to stdout
    per completed request as a progress indicator.
    """

    def __init__(self, url, ofile):
        threading.Thread.__init__(self)
        self.curl = pycurl.Curl()
        self.curl.setopt(pycurl.URL, url)
        self.curl.setopt(pycurl.WRITEDATA, ofile)
        self.curl.setopt(pycurl.FOLLOWLOCATION, 1)
        self.curl.setopt(pycurl.MAXREDIRS, 5)
        # NOSIGNAL is required for threaded libcurl use; SIGPIPE is
        # ignored at module level for the same reason.
        self.curl.setopt(pycurl.NOSIGNAL, 1)
        self.curl.setopt(pycurl.USERAGENT, 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.1 (KHTML, like Gecko) Ubuntu/11.04 Chromium/14.0.835.202 Chrome/14.0.835.202 Safari/535.1')
        self.curl.setopt(pycurl.PROXY, '127.0.0.1:9050')
        self.curl.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_SOCKS5)
        self.curl.setopt(pycurl.REFERER, 'http://www.google.co.in/')

    def run(self):
        # Close the handle even when the transfer raises, so failed
        # fetches do not leak curl handles.
        try:
            self.curl.perform()
        finally:
            self.curl.close()
        sys.stdout.write(".")
        sys.stdout.flush()

# Read list of URIs from file specified on commandline
try:
    with open(sys.argv[1]) as url_file:
        urls = url_file.readlines()
except IndexError:
    # No file was specified, show usage string
    print("Usage: %s <file with uris to fetch>" % sys.argv[0])
    raise SystemExit

# Initialize thread array and the file number
threads = []

# Fetch every URI sequentially: one worker thread per URI, joined before
# the next starts, with a Tor restart between requests for a fresh exit IP.
fileno = 0
t1 = time.time()
for url in urls:
    # Response body for URI #n lands in a file named "n".
    with open(str(fileno), "wb") as outfile:
        worker = Test(url.rstrip(), outfile)
        worker.start()
        worker.join()
    fileno += 1
    refresh_ip()
    time.sleep(3)  # give Tor a moment to finish restarting
t2 = time.time()
print("\n** Singlethreading, %d seconds elapsed for %d uris" % (int(t2 - t1), len(urls)))

Advertisements
Tagged , , , , , , ,

LXML HTML Parsing Usage Example


from lxml.html import fromstring

# Pull product names (<input value=...> inside table rows) and result links
# (<h3><a href=...>) out of the saved crawl pages named "0".."999", and
# append them to the result_urls file.
with open("result_urls", "a+") as out:
	for page_no in range(0, 1000):
		with open(str(page_no)) as page:
			tree = fromstring(page.read())
		out.write("Product Name: ")
		for node in tree.cssselect('tr input'):
			value = node.get('value')  # product_name
			if value is not None:  # missing attribute would crash write()
				out.write(value)
			out.write("\n")
		out.write("URLS:")
		out.write("\n")
		for node in tree.cssselect('h3 a'):
			href = node.get('href')  # url
			if href is not None:
				out.write(href)
			out.write("\n")
		out.flush()

Read all lines of a file into a list

The file path is passed as a command-line argument

# Read every line of the file named on the command line; close the handle
# deterministically instead of leaking it.
with open(sys.argv[1]) as keyword_file:
    keywords = keyword_file.readlines()

Empty Inbox


import imaplib

# Drain the inbox by repeatedly reconnecting and deleting messages 1-99.
# Reconnecting each round keeps sessions short-lived; the outer loop runs
# a fixed large number of rounds rather than checking for emptiness.
for round_no in range(0, 10000):
	box = imaplib.IMAP4_SSL('imap.mail.microsoftonline.com', 993)
	print(round_no, ":Connected")
	box.login("*******@nextag.microsoftonline.com", "******")
	print("Logged In")
	box.select('Inbox')
	print("Inbox Selected")
	for num in range(1, 100):
		# Message-set arguments must be strings; a raw int breaks
		# under modern imaplib.
		box.store(str(num), '+FLAGS', '\\Deleted')
	print("100 mail selected")
	box.expunge()
	box.close()
	box.logout()
print("I think all mails have been deleted")

Delete all emails in the inbox


import imaplib

# Connect once, flag every message in the Inbox as deleted, then expunge
# and disconnect cleanly.
box = imaplib.IMAP4_SSL('imap.mail.microsoftonline.com', 993)
box.login("user@nextag.microsoftonline.com", "password")
box.select('Inbox')
status, results = box.search(None, 'ALL')
message_ids = results[0].split()
for message_id in message_ids:
	box.store(message_id, '+FLAGS', '\\Deleted')
box.expunge()
box.close()
box.logout()

Logging Traceback


import traceback
import sys

# Demonstration: capture a traceback as text with traceback.format_exc().
# The inner handler turns the NameError from the undefined name into a bare
# NameError; the outer handler renders the stack trace and prints it.
try:
	try:
		print(undefined_name)  # deliberately undefined -> NameError
	except Exception:
		raise NameError
except Exception:
	x = traceback.format_exc()
	print(x)

Microsoft Online IMAP/SMTP Setting for Mozilla Thunderbird

IMAP
Account Name: NexTag
Your Name: Utsav Sabharwal
Email Address: usabharwal@nextag.microsoftonline.com
Server Name: imap.mail.microsoftonline.com
Port: 993
User Name: usabharwal@nextag.microsoftonline.com
Connectivity Security: SSL/TLS
Authentication Method: NTLM

SMTP
smtp.mail.microsoftonline.com
Port: 587
User Name: usabharwal@nextag.microsoftonline.com
Connectivity Security: STARTTLS
Authentication Method: Normal password

Google Free API

Input file at: https://dustycodes.wordpress.com/2011/11/14/input-uss/

import urllib.parse
import urllib.request

# For each product name in input.uss, ask the Google Custom Search API for
# review pages, then print the names whose result pages actually mention
# them. (The original snippet was corrupted by HTML entities - '&amp;' for
# '&', '&gt;' for '>' - and discarded every .strip()/.replace() result, so
# the query was never escaped and the trailing newline on the product name
# prevented matches.)
BASE_URL = ('https://www.googleapis.com/customsearch/v1'
	'?key=AIzaSyDbFnjndOj_hXgsw3MlaCevoUsIQ4egPMQ'
	'&client=google-csbe'
	'&cx=012749261079093371120:utxs7crkqqw'
	'&fields=items(link)&q=')

with open("input.uss") as infile:
	for line in infile:
		orig = line.strip()
		if not orig:
			continue
		# Apply the escaping the original intended but threw away.
		quoted = orig.replace(' ', '+').replace(',', '%2c')
		google_search_url = (BASE_URL + quoted +
			'+reviews+rating+more:user_reviews+more:recent3'
			'+more:recent4+more:recent2')
		with urllib.request.urlopen(google_search_url) as response:
			urls = response.read().decode('utf-8', 'replace')
		# Walk up to the first 10 '"link": "..."' entries in the response.
		for _ in range(0, 10):
			url = urls[urls.find('"link": "') + 9:]
			url = url[:url.find('"')]
			if not url:
				break
			content = urllib.request.urlopen(url).read().decode('utf-8', 'replace')
			if content.find("reviews") > -1:
				if content.find(orig) > -1:
					print(orig)
			urls = urls[urls.find(url) + len(url):]

input.uss

BCBGMAXAZRIA Cotton Voile Sundress  Women's Dress - Navy
Men's Twill and Mesh Safari Hat
Jordan Retro 13 Men's Hoodie
Laminate Flooring Kempas Floors 12mm Floor Wood Hardwood option
G Meso Netbook with Intel Atom Processor - Pink
Wheeled Business Case, 13" H x 17" W x 6 1 - 2" -  Black
K-3487-47 Bancroft Comfort Height Elongated Toilet
Riverside Urban Crossings L-Shaped Desk with Hutch - Espresso - RVS560

Automated Process Starter


# Supervisor loop: every 4 hours, kill any running paster processes, wait
# for them to die, then relaunch the spider job runner in the background.
while true
do
        killall paster
        sleep 10
        paster job_runner --configuration-file machine/ec2/spider/ini/spider.ini --job-name=urlbroker:Starter &
        sleep 14400
done