Monthly Archives: October 2011

Email Reader

Save this file as imaplib_connect.py and fill in your user id and password below.

import imaplib
import ConfigParser
import os

def open_connection(verbose=False):
    # Read the config file
    config = ConfigParser.ConfigParser()
    config.read([os.path.expanduser('~/.pymotw')])

    # Connect to the server
    hostname = "imap.mail.microsoftonline.com"
    if verbose: print 'Connecting to', hostname
    connection = imaplib.IMAP4_SSL(hostname)

    # Login to our account
    username = "XXXXXX@nextag.microsoftonline.com"
    password = "XXXXXX"
    if verbose: print 'Logging in as', username
    connection.login(username, password)
    return connection

if __name__ == '__main__':
    c = open_connection(verbose=True)
    try:
        print c
    finally:
        c.logout()
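Note that the config file read at the top of open_connection() is not actually used; the hostname and credentials are hardcoded below it. To pull them from the config instead (a sketch, assuming a hypothetical ~/.pymotw file with a [server] section), replace the hardcoded values with:

# ~/.pymotw (hypothetical contents):
# [server]
# hostname = imap.mail.microsoftonline.com
# username = XXXXXX@nextag.microsoftonline.com
# password = XXXXXX

hostname = config.get('server', 'hostname')
username = config.get('server', 'username')
password = config.get('server', 'password')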

LXML Parser

Takes an XML file as input, reads the products one by one, and stores all the data fetched for each product in a multi-dimensional array:

from lxml import etree

class Test:
    def parser(self, x, pool):
        # Recursively collect a [tag, text] pair for every leaf element
        if len(x) == 0:
            try:
                pool.append([x.tag, x.text.encode('utf-8')])
            except Exception:
                # x.text is None, so store -1 as a placeholder
                pool.append([x.tag, -1])
        else:
            for y in x:
                self.parser(y, pool)

    def Starter(self):
        # Input file
        infile = "1.xml"
        context = etree.iterparse(infile, events=('end',), tag='product')
        for event, elem in context:
            print "--- --- ---"
            print event
            pool = []
            for x in elem:
                self.parser(x, pool)
            pool.sort()
            print pool

            # It's safe to call clear() here because no descendants will be accessed
            elem.clear()

            # Also eliminate now-empty references from the root node to the cleared element
            while elem.getprevious() is not None:
                del elem.getparent()[0]
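A minimal driver, with a toy 1.xml to exercise it (the sample data here is hypothetical):

# 1.xml (hypothetical contents):
# <root>
#   <product product_number="1">
#     <name>Widget</name>
#     <price>9.99</price>
#   </product>
# </root>

if __name__ == '__main__':
    Test().Starter()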

SAX-style parser for large XML files

<pre>from lxml import etree

def abc(x):
	# Recursively print each leaf element's tag and text
	if len(x) == 0:
		try:
			print "--- --- --- --- --- --- "
			print x.tag
			if len(x.text.encode('utf-8').strip()) > 0:
				print x.text.encode('utf-8')
			else:
				print "No Data"
		except Exception:
			# x.text is None
			print "ERROR", x.tag
	else:
		for y in x:
			abc(y)

infile = "1.xml"
context = etree.iterparse(infile, events=('end',), tag='product')
for event, elem in context:
    print "--- --- ---"
    for x in elem:
        abc(x)

    # It's safe to call clear() here because no descendants will be accessed
    elem.clear()

    # Also eliminate now-empty references from the root node to the cleared element
    while elem.getprevious() is not None:
        del elem.getparent()[0]
</pre>
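The clear-and-delete idiom at the bottom of both listings can be factored into a reusable helper (a sketch of the common fast_iter pattern; the name is mine, not something the snippets above define):

<pre>def fast_iter(context, func):
    # Apply func to each matched element, then free the element and its
    # already-processed siblings so memory use stays flat on huge files
    for event, elem in context:
        func(elem)
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]
    del context

context = etree.iterparse("1.xml", events=('end',), tag='product')
fast_iter(context, abc)
</pre>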

MySQL: Remove Duplicate Rows

Copy all rows into a new table using a GROUP BY clause (one row survives per name):
 create table temp select * from sources group by name;
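To finish the job, swap the de-duplicated table in for the original (a sketch; assumes no other sessions are writing to sources and that keeping an arbitrary row per name is acceptable):

 drop table sources;
 rename table temp to sources;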


Script to extract domain URLs from email (Source Not found error)

Script command:

python dynurl.py beta-missing.txt "url=" "," a.txt

TODO: make a more general extraction script.

import sys

input_filename = sys.argv[1]
start_pattern = sys.argv[2]
end_pattern = sys.argv[3]
output_filename = sys.argv[4]

f = open(input_filename, "r")
f1 = open(output_filename, "a+")
d = []
KNOWN_TLDS = ('com', 'biz', 'ca', 'info', 'net', 'org', 'uk', 'us')
while True:
    line_feed = f.readline()
    if not line_feed:
        break
    if line_feed.find(start_pattern) > -1:
        # Slice out the text between start_pattern and end_pattern
        content = line_feed[line_feed.find(start_pattern) + len(start_pattern):]
        content = content[:content.find(end_pattern)]
        # Drop everything up to and including the first dot (e.g. a subdomain)
        domain = content[content.find(".") + 1:]
        for x in KNOWN_TLDS:
            if domain.find(x) > -1:
                domain = domain.split(x)
                ps = "www." + domain[0] + x
                d.append(ps)
                f1.write(ps + "\n")
                break
        else:
            # for/else: no known TLD matched anywhere in the string
            print "Unknown TLD"
d = sorted(set(d))  # de-duplicate first, then sort
for x in d:
    print x
f.close()
f1.close()
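For reference, the standard library can do the host extraction more robustly than the KNOWN_TLDS matching above, which misfires whenever a TLD string happens to appear elsewhere in the hostname (a sketch using urlparse from the Python 2 stdlib; extract_host is a hypothetical helper name):

from urlparse import urlparse

def extract_host(url):
    # Returns the hostname part of a URL, e.g. 'www.example.co.uk'
    netloc = urlparse(url).netloc
    return netloc.split(':')[0]  # drop any port number

print extract_host("http://www.example.co.uk/path?q=1")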

Split XML file into products

<code>
# Excerpted from a larger importer: import_nextag_products, steps_flags,
# log, and self.options come from the enclosing class.
f = open(filename, "r")
start_line = '<?xml version="1.0" encoding="utf-8" ?><root>'
# end_line closes the <root> opened in start_line so each fragment is well-formed XML
end_line = "<metadata><date>Tue Sep 27 02:19:53 PDT 2011</date><dump_type>full_set</dump_type><product_count>98968</product_count><tag_count>966399</tag_count></metadata></root>"
while True:
    line_feed = f.readline()
    if not line_feed:
        log.info("File reading complete")
        break
    if line_feed.find("<product product_number=") > -1:
        content = start_line + line_feed
        # find() returns -1 (truthy!) on a miss, so compare explicitly
        if line_feed.find("</product>") > -1:
            content += end_line
            ff = open("temp.importer", "w")
            ff.write(content)
            ff.close()
            try:
                import_nextag_products.import_products("temp.importer", steps_flags)
            except Exception, exception:
                log.exception('%s: %s, options: %s', exception.__class__.__name__, exception, self.options)
                raise
            finally:
                if import_nextag_products is not None:
                    log.info("Done. File: %s, Total: %s, Processed: %d, Skipped: %s, New products: %s, Updated: %s.  Options: %s",
                             filename,
                             import_nextag_products.total_processed_count + import_nextag_products.total_skipped_count,
                             import_nextag_products.total_processed_count,
                             import_nextag_products.total_skipped_count,
                             import_nextag_products.new_product_count,
                             import_nextag_products.product_update_count,
                             self.options)
                else:
                    log.info('Done.  Options: %s', self.options)
f.close()
</code>
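For standalone tinkering, stand-ins along these lines would supply most of the missing names (all placeholders, not the real importer API; self.options would still need the enclosing class):

<code>
import logging
logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)

class FakeImporter(object):
    # Hypothetical stand-in for the real import_nextag_products module
    total_processed_count = 0
    total_skipped_count = 0
    new_product_count = 0
    product_update_count = 0
    def import_products(self, path, flags):
        log.info("would import %s with flags %s", path, flags)

import_nextag_products = FakeImporter()
steps_flags = None
filename = "1.xml"
</code>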

Code: Encode/Decode HTML

<pre>htmlCodes = [
    ['&', '&amp;'],
    ['<', '&lt;'],
    ['>', '&gt;'],
    ['"', '&quot;'],
    [' ', '&nbsp;']
]
htmlCodesReversed = htmlCodes[:]
htmlCodesReversed.reverse()

def htmlDecode(s, codes=htmlCodesReversed):
    """ Returns the ASCII decoded version of the given HTML string. This does
        NOT remove normal HTML tags like <p>. It is the inverse of htmlEncode()."""
    for code in codes:
        s = s.replace(code[1], code[0])
    return s
</pre>
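The docstring mentions htmlEncode(); a matching counterpart might look like this (a sketch; note that the last rule turns every space into &nbsp;):

<pre>def htmlEncode(s, codes=htmlCodes):
    """ Returns the HTML encoded version of the given string.
        Inverse of htmlDecode(). """
    for code in codes:
        s = s.replace(code[0], code[1])
    return s
</pre>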

Execute before importing XML file

<pre>k = open("2.25k", "r")
kk = open("8.xml", "a+")
t = k.readlines()
for x in t:
    kk.write(x.replace('<price_range></price_range>', '<price_range>0</price_range>'))
k.close()
kk.close()

k = open("8.xml", "r")
kk = open("last12k.xml", "a+")
t = k.readlines()
for x in t:
    kk.write(x.replace(" to&amp;nbsp;", " to "))
k.close()
kk.close()</pre>
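The two passes can also be collapsed into one, going straight from 2.25k to last12k.xml without the intermediate 8.xml (a sketch):

<pre>k = open("2.25k", "r")
kk = open("last12k.xml", "w")
for x in k:
    x = x.replace('<price_range></price_range>', '<price_range>0</price_range>')
    x = x.replace(" to&amp;nbsp;", " to ")
    kk.write(x)
k.close()
kk.close()</pre>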

Save MySQL query results to a file

 mysql -uroot spider -e "select url, rdomain, product_id, metadata from url_queue where rdomain='uk.co.amazon.www'" >> amazonurls.txt
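Alternatively, MySQL can write the result set to a tab-separated file itself (a sketch; requires the FILE privilege, and the file lands on the server host):

 mysql -uroot spider -e "select url, rdomain, product_id, metadata from url_queue where rdomain='uk.co.amazon.www' into outfile '/tmp/amazonurls.txt'"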

Vim substitution to replace the crawler email alias with individual recipients:

%s/'#org_wize_crawl@nextag.com'/'Jayant.Yadav@nextag.com', 'Utsav.Sabharwal@nextag.com'/


It would be an honor as well as a pleasure for me to work on this project. For my technical approach and past experience, please check the PM.

About the Project:

Here is what can be done: I will develop a Python script (or whatever you prefer) that takes all the domains you supply (in whatever form you like), fetches each one's WHOIS information, most importantly the admin email address and expiration date, and makes it all readily available to you in whatever format you want. We can produce a CSV that imports into Outlook, or build a tool that mail-merges it automatically. I would not recommend bulk emailing, since it tends to make your messages look like spam, but if you want it, that is no problem.
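For illustration, the WHOIS lookup at the heart of such a script can be a raw query over TCP port 43 (a sketch; whois.verisign-grs.com answers for .com/.net, other TLDs use their own servers, and the admin email and expiration date still have to be parsed out of the text reply):

import socket

def whois_lookup(domain, server="whois.verisign-grs.com"):
    # Minimal WHOIS client: send the domain, read until the server closes
    s = socket.create_connection((server, 43))
    s.sendall(domain + "\r\n")
    response = ""
    while True:
        chunk = s.recv(4096)
        if not chunk:
            break
        response += chunk
    s.close()
    return response

print whois_lookup("example.com")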



About Me:
I work at NexTag as a web-crawler specialist. I have developed multithreaded, scalable crawlers that can fetch around 10,000 pages in 300 seconds, and I have a thorough understanding of the crawlers behind Google, Yahoo, etc., as well as of the research going on in this field at MIT, Stanford, and elsewhere.

My rates are fair and affordable, and I assure you quality service.