Category Archives: Python

Showing exception details in Python

import sys, os

try:
    raise NotImplementedError("No error")
except Exception:
    # sys.exc_info() returns (type, value, traceback) for the active exception
    exc_type, exc_obj, exc_tb = sys.exc_info()
    # keep only the file name, not the full path, from the frame's code object
    fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
    print(exc_type, fname, exc_tb.tb_lineno)
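
For comparison, the standard traceback module prints the same details without unpacking sys.exc_info() by hand:

import traceback

try:
    raise NotImplementedError("No error")
except Exception:
    traceback.print_exc()  # file name, line number and exception, preformatted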

Python’s Data Structures: Complexity Analysis

Complexity Analysis of General Data Structures for Common Operations:

Operation                  | Linked list           | Array | Dynamic array  | Balanced tree | Random access list
---------------------------+-----------------------+-------+----------------+---------------+-------------------
Indexing                   | Θ(n)                  | Θ(1)  | Θ(1)           | Θ(log n)      | Θ(log n)
Insert/delete at beginning | Θ(1)                  | N/A   | Θ(n)           | Θ(log n)      | Θ(1)
Insert/delete at end       | Θ(1)                  | N/A   | Θ(1) amortized | Θ(log n)      | Θ(log n) updating
Insert/delete in middle    | search time + Θ(1)[1] | N/A   | Θ(n)           | Θ(log n)      | Θ(log n) updating
Wasted space (average)     | Θ(n)                  | 0     | Θ(n)[2]        | Θ(n)          | Θ(n)

Complexity Analysis of Python’s Data Structures for Common Operations:

list

Internally, a list is represented as an array

Operation      | Average Case | Amortized Worst Case
---------------+--------------+---------------------
Copy           | O(n)         | O(n)
Append[1]      | O(1)         | O(1)
Insert         | O(n)         | O(n)
Get Item       | O(1)         | O(1)
Set Item       | O(1)         | O(1)
Delete Item    | O(n)         | O(n)
Iteration      | O(n)         | O(n)
Get Slice      | O(k)         | O(k)
Del Slice      | O(n)         | O(n)
Set Slice      | O(k+n)       | O(k+n)
Extend[1]      | O(k)         | O(k)
Sort           | O(n log n)   | O(n log n)
Multiply       | O(nk)        | O(nk)
x in s         | O(n)         |
min(s), max(s) | O(n)         |
Get Length     | O(1)         | O(1)
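
Because a list is array-backed, the O(1) append versus O(n) insert in the table shows up directly in timings; a rough timeit sketch (absolute numbers vary by machine):

import timeit

# appending at the end: amortized O(1) per call
print(timeit.timeit("lst.append(0)", setup="lst = []", number=100000))

# inserting at index 0: every element shifts right, O(n) per call
print(timeit.timeit("lst.insert(0, 0)", setup="lst = []", number=100000))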

collections.deque

A deque (double-ended queue) is represented internally as a doubly linked list of fixed-size blocks, which keeps operations at both ends O(1)

Operation  | Average Case | Amortized Worst Case
-----------+--------------+---------------------
Copy       | O(n)         | O(n)
append     | O(1)         | O(1)
appendleft | O(1)         | O(1)
pop        | O(1)         | O(1)
popleft    | O(1)         | O(1)
extend     | O(k)         | O(k)
extendleft | O(k)         | O(k)
rotate     | O(k)         | O(k)
remove     | O(n)         | O(n)
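
For comparison with list.insert(0, x) above, appendleft on a deque is O(1); a rough sketch along the same lines:

import timeit

# O(1) at the left end, unlike list.insert(0, x)
print(timeit.timeit("d.appendleft(0)",
                    setup="from collections import deque; d = deque()",
                    number=100000))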

dict

Operation    | Average Case | Amortized Worst Case
-------------+--------------+---------------------
Copy[2]      | O(n)         | O(n)
Get Item     | O(1)         | O(n)
Set Item[1]  | O(1)         | O(n)
Delete Item  | O(1)         | O(n)
Iteration[2] | O(n)         | O(n)
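
The dict's average O(1) lookup is also why membership tests against hashed containers beat the list's O(n) "x in s" scan; a rough sketch (set shares dict's hashing behavior):

import timeit

# membership: O(n) scan for a list, O(1) average for a hashed container
print(timeit.timeit("999999 in data",
                    setup="data = list(range(1000000))", number=100))
print(timeit.timeit("999999 in data",
                    setup="data = set(range(1000000))", number=100))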

References:

http://wiki.python.org/moin/TimeComplexity

http://en.wikipedia.org/wiki/Linked_list


Creating a variable whose name is the value of another variable in Python


domain = "google"

# vars() at module level is the module's namespace dict,
# so this creates a module-level variable named "google"
vars()[domain] = 'Nextag'

print("Output is", google)

Output


Output is Nextag
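
Dynamically created variable names are hard to track and easy to shadow; a plain dict usually does the same job more safely (a minimal sketch):

values = {}
domain = "google"
values[domain] = 'Nextag'
print("Output is", values[domain])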

Zappos Review Parser

import os
import hashlib  # Python 2's md5 module replaced by hashlib

def create_hashing_table():
    # Each input line is "product_id::url"; the crawler saved each page under
    # the md5 of its URL, so map that file name back to the original line
    urls = open("product_id_with_urls").readlines()
    hashing_table = []
    for url in urls:
        u = url
        url = url.split("::")[1]
        if len(url) > 1:
            file_name = hashlib.md5(url.strip().encode()).hexdigest() + ".uss"
            hashing_table.append(file_name + ":.:" + u)
    return hashing_table

def find_date(content):
    try:
        pattern = '<div>'
        l = len(pattern)
        focus = content[content.find(pattern) + l:content.find('<div>')]
        return focus[focus.find('<p>') + 3:focus.find('</p')].strip()
    except Exception:
        return False

def find_review(content):
    try:
        pattern = '<p'
        l = len(pattern)
        focus = content[content.find(pattern) + l:content.find('<p>')]
        return focus[focus.find('>') + 1:focus.find('</p')].strip()
    except Exception:
        return False

def find_author(content):
    try:
        pattern = '<div>'
        l = len(pattern)
        focus = content[content.find(pattern) + l:content.find('<p>')]
        return focus[focus.find('<h3>') + 4:focus.find('</h3')].strip()
    except Exception:
        return False

def find_rating(content):
    try:
        pattern = '<div>'
        l = len(pattern)
        focus = content[content.find(pattern) + l:content.find('<p><strong>Comfort</strong>')]
        return str(int(focus[focus.find('Rated:') + 6:focus.find(' stars')])).strip()
    except Exception:
        return False

def start():
    hashing_table = create_hashing_table()
    count = 0
    success = 0
    for filename in os.listdir('round2'):
        for x in hashing_table:
            if x.find(filename) > -1:
                url = x.split(":.:")[1].split("::")[1]
                product_id = x.split(":.:")[1].split("::")[0]
                if url.find("www.zappos.com") > -1:
                    if parse_reviews(filename, url, product_id):
                        success += 1
                    count += 1
    print(success, "/", count)

def parse_reviews(filename, url, product_id):
    html = open("round2/" + filename).read()
    content = html.split('<div>')[1:]
    for x in content:
        date = find_date(x)
        review = find_review(x)
        rating = find_rating(x)
        author = find_author(x)
        if product_id and url and date and review and rating and author:
            try:
                create_uml(product_id, url, date, review, rating, author)
            except Exception as ex:
                print("Error Writing", filename, ex)
                return False
        else:
            print("Error Parsing ", filename, product_id, url, date, review, rating, author)
            return False
    return True

def create_uml(product_id, url, date, review, rating, author):
    # Append one ":::"-delimited record, terminated by "--->"
    pattern = product_id + ":::" + url + ":::" + date + ":::" + review + ":::" + rating + ":::" + author + "--->"
    f = open("zappos.uml", "a+")
    f.write(pattern)
    f.close()

start()


To check if a page has Bazaar Voice / HReview / Power Review Structure

import os

count = 0
f1 = open("shortlisted", "a+")
# Class names that mark hReview, PowerReviews and Bazaarvoice review widgets
markers = ('hreview', 'prreviewwrap', 'pr-review-wrap',
           'bvstandalonereviewsectionreview', 'bvrrcontentreview')
for filename in os.listdir('.'):
    f = open(filename)
    content = f.read().lower()
    f.close()
    if any(m in content for m in markers):
        text = "mv crawler_6DEC/" + filename + " shortlisted/" + filename + chr(10)
        print(text)
        f1.write(text)
        f1.flush()
        count += 1

print(count)
f1.close()

Search a filename in the current directory using pattern matching

import os
import fnmatch

def find_first_match(pattern='*.txt'):
    # a bare return is only valid inside a function
    for filename in os.listdir('.'):
        if fnmatch.fnmatch(filename, pattern):
            return filename
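
The same lookup can be written with the standard glob module, which combines the directory listing and the pattern match (a minimal equivalent sketch):

import glob

# glob.glob returns every name in the current directory matching the pattern
matches = glob.glob('*.txt')
if matches:
    print(matches[0])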

Domain Extractor (Draft)

import urllib.request

def get_domain(url):
    # IANA publishes one TLD per line, uppercase, with a '#' comment on top
    data = urllib.request.urlopen(
        "http://data.iana.org/TLD/tlds-alpha-by-domain.txt").read().decode()
    tlds = set(line.strip().lower() for line in data.splitlines()
               if not line.startswith("#"))

    parts = url.split(".")
    for x in range(0, len(parts)):
        try:
            for y in parts[x].split("/"):
                # the TLD list is a set now, so use membership instead of
                # the original (invalid) list .find() call
                if y.lower() in tlds:
                    # return the label just before the first TLD match
                    return parts[x - 1]
        except Exception as ex:
            print(x, ex)
            continue
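
A quick call for illustration; since the draft returns the label immediately before the first part found in the TLD list, results depend on the live IANA data:

print(get_domain("http://www.example.com/page"))  # expected: "example"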