Tag Archives: crawler

Single-Threaded cURL Crawler Using Tor

import subprocess
import sys, threading, time
import pycurl

def refresh_ip():
    """Restart the local Tor service and wait for the restart to finish.

    Runs ``sudo /etc/init.d/tor restart``. Any failure (the command cannot
    be launched, or it exits with a non-zero status) is reported on stdout
    and never raised, so the calling crawl loop keeps going.
    """
    print("Refreshing IP .. .")
    try:
        # An argument list without a shell means a missing binary surfaces
        # as OSError here instead of being hidden behind the shell.
        process = subprocess.Popen(
            ['sudo', '/etc/init.d/tor', 'restart'],
            stdout=subprocess.PIPE,
        )
        # Drain the pipe and reap the child; otherwise the process lingers
        # as a zombie and we would return before the restart completed.
        process.communicate()
    except Exception as ex:
        print("Failed to Refresh IP.  %s" % (ex,))
        return
    if process.returncode != 0:
        print("Failed to Refresh IP.  tor restart exited with status %d"
              % process.returncode)

# The libcurl tutorial recommends ignoring SIGPIPE whenever
# pycurl.NOSIGNAL is in use. If the signal module (or either of the two
# names) cannot be imported, the step is silently skipped.
try:
    import signal
    from signal import SIGPIPE, SIG_IGN
except ImportError:
    pass
else:
    signal.signal(SIGPIPE, SIG_IGN)

class Test(threading.Thread):
    """Thread that downloads a single URL with pycurl via a SOCKS5 proxy.

    The response body is handed to pycurl as ``WRITEDATA`` and therefore
    lands in ``ofile``. The caller owns ``ofile`` and must keep it open
    until the thread has finished.
    """

    # Request identity sent with every fetch.
    USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.1 (KHTML, like Gecko) Ubuntu/11.04 Chromium/14.0.835.202 Chrome/14.0.835.202 Safari/535.1'
    REFERER = 'http://www.google.co.in/'
    # Local SOCKS5 endpoint (presumably the Tor daemon's port - confirm).
    PROXY = '127.0.0.1:9050'

    def __init__(self, url, ofile):
        """Create and configure the curl handle for ``url``.

        url   -- address to fetch.
        ofile -- open, writable file object that receives the body.
        """
        threading.Thread.__init__(self)
        self.curl = pycurl.Curl()
        options = (
            (pycurl.URL, url),
            (pycurl.WRITEDATA, ofile),
            # Follow redirects, but give up after five hops.
            (pycurl.FOLLOWLOCATION, 1),
            (pycurl.MAXREDIRS, 5),
            (pycurl.NOSIGNAL, 1),
            (pycurl.USERAGENT, self.USER_AGENT),
            (pycurl.PROXY, self.PROXY),
            (pycurl.PROXYTYPE, pycurl.PROXYTYPE_SOCKS5),
            (pycurl.REFERER, self.REFERER),
        )
        for option, value in options:
            self.curl.setopt(option, value)

    def run(self):
        """Perform the transfer, then print a progress dot on success.

        The curl handle is closed whether or not the transfer succeeds;
        an error raised by ``perform()`` still propagates out of the thread.
        """
        try:
            self.curl.perform()
        finally:
            # Release the handle even when perform() raises.
            self.curl.close()
        sys.stdout.write(".")
        sys.stdout.flush()

# Read list of URIs from file specified on commandline
try:
    url_list_path = sys.argv[1]
except IndexError:
    # No file was specified, show usage string
    print("Usage: %s <file with uris to fetch>" % sys.argv[0])
    raise SystemExit

# The with-block closes the list file as soon as its lines are read;
# each entry keeps its trailing newline, exactly as readlines() returns it.
with open(url_list_path) as url_list:
    urls = url_list.readlines()

# Initialize thread array (not used by this single-threaded variant)
threads = []

# Fetch the URIs strictly one after another: each one gets its own thread,
# which is joined before the next URI is started.
t1 = time.time()
for fileno, url in enumerate(urls):
    # Bodies are saved to files named "0", "1", "2", ... in the current
    # directory; the with-block closes the file even if the fetch setup fails.
    with open(str(fileno), "wb") as f:
        t = Test(url.rstrip(), f)
        t.start()
        t.join()
    # Restart Tor after every fetch, then pause before the next request.
    refresh_ip()
    time.sleep(3)
t2 = time.time()
print("\n** Singlethreading, %d seconds elapsed for %d uris" % (int(t2 - t1), len(urls)))

Advertisements
Tagged , , , , , , ,