Category Archives: Python Scripts

Showing exception details in Python

import sys, os
# Demonstrates reporting details of a caught exception: its type plus the
# file name and line number recorded in the traceback.
try:
    raise NotImplementedError("No error")
except Exception:
    # sys.exc_info() returns (type, value, traceback) for the exception
    # currently being handled.
    exc_type, exc_obj, exc_tb = sys.exc_info()
    # Keep only the base name of the source file that owns the frame.
    fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
    print(exc_type, fname, exc_tb.tb_lineno)

Advertisements

Curl Single Threaded Crawler using TOR

import subprocess
import sys, threading, time
import pycurl

def refresh_ip():
    """Restart the local Tor service and wait for the restart command to end.

    The restart is best effort: a failure to launch the command or a
    non-zero exit status is reported on stdout and otherwise ignored.
    """
    print("Refreshing IP .. .")
    try:
        # A fixed argument list needs no shell interpretation.
        process = subprocess.Popen(
            ['sudo', '/etc/init.d/tor', 'restart'], stdout=subprocess.PIPE)
        # communicate() drains the pipe and reaps the child, so no zombie
        # process is left behind and the exit status becomes available.
        process.communicate()
    except OSError as ex:
        print("Failed to Refresh IP.  %s" % ex)
        return
    if process.returncode != 0:
        print("Failed to Refresh IP.  exit status %d" % process.returncode)

# We should ignore SIGPIPE when using pycurl.NOSIGNAL - see
# the libcurl tutorial for more info.
try:
    import signal
    from signal import SIGPIPE, SIG_IGN
except ImportError:
    # The signal module or SIGPIPE is unavailable here; nothing to ignore.
    pass
else:
    signal.signal(SIGPIPE, SIG_IGN)

class Test(threading.Thread):
    """Thread that downloads one URL through a SOCKS5 proxy on 127.0.0.1:9050.

    The response body is written to the file object given at construction.
    """

    def __init__(self, url, ofile):
        """Configure the curl handle.

        Args:
            url: address to fetch.
            ofile: open, writable file object that receives the response body.
        """
        threading.Thread.__init__(self)
        self.curl = pycurl.Curl()
        self.curl.setopt(pycurl.URL, url)
        self.curl.setopt(pycurl.WRITEDATA, ofile)
        # Follow at most five redirects.
        self.curl.setopt(pycurl.FOLLOWLOCATION, 1)
        self.curl.setopt(pycurl.MAXREDIRS, 5)
        self.curl.setopt(pycurl.NOSIGNAL, 1)
        self.curl.setopt(pycurl.USERAGENT, 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.1 (KHTML, like Gecko) Ubuntu/11.04 Chromium/14.0.835.202 Chrome/14.0.835.202 Safari/535.1')
        self.curl.setopt(pycurl.PROXY, '127.0.0.1:9050')
        self.curl.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_SOCKS5)
        self.curl.setopt(pycurl.REFERER, 'http://www.google.co.in/')

    def run(self):
        """Perform the transfer, then print a progress dot on success.

        The curl handle is closed whether or not the transfer succeeds.
        """
        try:
            self.curl.perform()
        finally:
            # Release the handle even when perform() raises.
            self.curl.close()
        sys.stdout.write(".")
        sys.stdout.flush()

# Read list of URIs from file specified on commandline
try:
    # Close the list file once read; blank lines would otherwise become
    # empty URLs, so they are dropped here.
    with open(sys.argv[1]) as url_file:
        urls = [line.strip() for line in url_file if line.strip()]
except IndexError:
    # No file was specified, show usage string
    print("Usage: %s <file with uris to fetch>" % sys.argv[0])
    raise SystemExit

# Fetch the URIs one at a time; the response for the n-th URI (counting
# from 0) is saved in a file named "n" in the current directory.
t1 = time.time()
for fileno, url in enumerate(urls):
    # The with-block closes the output file even if the fetch fails.
    with open(str(fileno), "wb") as f:
        t = Test(url, f)
        t.start()
        # Joining right away keeps the crawl single-threaded.
        t.join()
    # Restart Tor between requests and pause before the next fetch.
    refresh_ip()
    time.sleep(3)
t2 = time.time()
print("\n** Singlethreading, %d seconds elapsed for %d uris" % (int(t2 - t1), len(urls)))

Tagged , , , , , , ,

Automated Process Starter


# Endless loop: stop running paster processes, start the urlbroker Starter
# job in the background, and repeat after the run interval has elapsed.
KILL_WAIT_SECONDS=10
RUN_SECONDS=14400
while :
do
        killall paster
        sleep "$KILL_WAIT_SECONDS"
        paster job_runner --configuration-file machine/ec2/spider/ini/spider.ini --job-name=urlbroker:Starter &
        sleep "$RUN_SECONDS"
done

Extract urls from log

# Collect the URL of every "processor-not-found-disabled-url" entry in the
# processor log, sort the URLs, and append them one per line to the file "url".
errors = []
with open("aggregation.spider.processor.log", "r") as f:
    for line_feed in f:
        if "processor-not-found-disabled-url" not in line_feed:
            continue
        parts = line_feed.split(" url:")
        if len(parts) < 2:
            # The marker is present but the entry carries no " url:" field.
            continue
        # The URL is the text after " url:" up to the next comma.
        errors.append(parts[1].split(",")[0].strip())
errors.sort()
# Open the output once; each URL gets its own line so entries stay separable.
with open("url", "a+") as f:
    for x in errors:
        f.write(x + "\n")

Email Reader

Save this file as imaplib_connect.py and write your user ID and password in it.

import imaplib
import ConfigParser
import os

def open_connection(verbose=False):
    """Open an SSL IMAP connection and log in with the account set below.

    Args:
        verbose: when true, print the host name and the user name as they
            are used.

    Returns:
        The logged-in imaplib.IMAP4_SSL connection. The caller is
        responsible for calling logout() on it.

    Raises:
        Any exception raised by imaplib while connecting or logging in.
    """
    # Connect to the server
    hostname = "imap.mail.microsoftonline.com"
    if verbose:
        print('Connecting to %s' % hostname)
    connection = imaplib.IMAP4_SSL(hostname)

    # Login to our account
    username = "XXXXXX@nextag.microsoftonline.com"
    password = "XXXXXX"
    if verbose:
        print('Logging in as %s' % username)
    try:
        connection.login(username, password)
    except Exception:
        # Do not leave the socket open when the login is rejected.
        connection.shutdown()
        raise
    return connection

# Script entry point: connect verbosely, show the connection object, and
# always log out afterwards.
if __name__ == '__main__':
    connection = open_connection(verbose=True)
    try:
        print(connection)
    finally:
        connection.logout()

LXML Parser

Takes an XML file as input, reads the products one by one, and stores all the data fetched for each row in a multi-dimensional array.

from lxml import etree
class Test:
	"""Print the leaf fields found under every 'product' element of an XML file."""

	def parser(self, x):
		"""Collect, sort, print and return the leaf fields found under x.

		Args:
			x: an element; it is a leaf when len(x) == 0.

		Returns:
			A sorted list of [tag, value] pairs, one per leaf element at or
			below x. The value is the element text encoded as UTF-8, or -1
			when the element has no text. Fields of nested elements are
			gathered into this one list instead of being dropped.
		"""
		pool = self._collect(x)
		pool.sort()
		print(pool)
		return pool

	def _collect(self, x):
		"""Return the unsorted [tag, value] pairs of the leaves at or below x."""
		if len(x) == 0:
			text = x.text
			# -1 marks a leaf that carries no text.
			return [[x.tag, -1 if text is None else text.encode('utf-8')]]
		pool = []
		for y in x:
			pool.extend(self._collect(y))
		return pool

	def Starter(self, infile="1.xml"):
		"""Stream infile and print the fields of each 'product' element.

		Args:
			infile: path of the XML file to read; defaults to "1.xml".
		"""
		context = etree.iterparse(infile, events=('end',), tag='product')
		for event, elem in context:
			print("--- --- ---")
			print(event)
			for x in elem:
				self.parser(x)

			# It's safe to call clear() here because no descendants will be accessed
			elem.clear()

			# Also drop the already-processed earlier siblings from the parent
			# so they do not pile up in memory.
			while elem.getprevious() is not None:
				del elem.getparent()[0]