Tag Archives: python

Python’s Data Structures: Complexity Analysis

Complexity Analysis of General Data Structures for Common Operations:

 

| Operation                  | Linked list            | Array | Dynamic array  | Balanced tree | Random access list |
|----------------------------|------------------------|-------|----------------|---------------|--------------------|
| Indexing                   | Θ(n)                   | Θ(1)  | Θ(1)           | Θ(log n)      | Θ(log n)           |
| Insert/delete at beginning | Θ(1)                   | N/A   | Θ(n)           | Θ(log n)      | Θ(1)               |
| Insert/delete at end       | Θ(1)                   | N/A   | Θ(1) amortized | Θ(log n)      | Θ(log n) updating  |
| Insert/delete in middle    | search time + Θ(1)[1]  | N/A   | Θ(n)           | Θ(log n)      | Θ(log n) updating  |
| Wasted space (average)     | Θ(n)                   | 0     | Θ(n)[2]        | Θ(n)          | Θ(n)               |
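
Python’s built-in list is the dynamic array column of this table; the over-allocation behind its wasted-space entry can be observed with sys.getsizeof(), assuming CPython’s usual growth strategy:

import sys

# The reported size jumps in steps rather than per element, because the list
# over-allocates so that appends stay O(1) amortized.
l = []
last = sys.getsizeof(l)
for i in range(32):
    l.append(i)
    size = sys.getsizeof(l)
    if size != last:
        print len(l), size
        last = size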

Complexity Analysis of Python’s Data Structures for Common Operations:

list

Internally, a list is represented as an array; the largest costs come from growing beyond the current allocation (everything must be copied) or from inserting or deleting near the beginning (everything after that point must be shifted).

| Operation      | Average Case | Amortized Worst Case |
|----------------|--------------|----------------------|
| Copy           | O(n)         | O(n)                 |
| Append[1]      | O(1)         | O(1)                 |
| Insert         | O(n)         | O(n)                 |
| Get Item       | O(1)         | O(1)                 |
| Set Item       | O(1)         | O(1)                 |
| Delete Item    | O(n)         | O(n)                 |
| Iteration      | O(n)         | O(n)                 |
| Get Slice      | O(k)         | O(k)                 |
| Del Slice      | O(n)         | O(n)                 |
| Set Slice      | O(k+n)       | O(k+n)               |
| Extend[1]      | O(k)         | O(k)                 |
| Sort           | O(n log n)   | O(n log n)           |
| Multiply       | O(nk)        | O(nk)                |
| x in s         | O(n)         |                      |
| min(s), max(s) | O(n)         |                      |
| Get Length     | O(1)         | O(1)                 |
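
A minimal timeit sketch (timings vary by machine) that makes the append vs. insert difference visible: append stays O(1) amortized, while insert at index 0 shifts every existing element.

import timeit

# Appending 10,000 items vs. inserting 10,000 items at the front of a list;
# the gap widens as the list grows.
print timeit.timeit('s.append(0)', setup='s = []', number=10000)
print timeit.timeit('s.insert(0, 0)', setup='s = []', number=10000)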

collections.deque

A deque (double-ended queue) is represented internally as a doubly linked list (of fixed-size blocks, for efficiency), which is why operations at either end are O(1).

| Operation  | Average Case | Amortized Worst Case |
|------------|--------------|----------------------|
| Copy       | O(n)         | O(n)                 |
| append     | O(1)         | O(1)                 |
| appendleft | O(1)         | O(1)                 |
| pop        | O(1)         | O(1)                 |
| popleft    | O(1)         | O(1)                 |
| extend     | O(k)         | O(k)                 |
| extendleft | O(k)         | O(k)                 |
| rotate     | O(k)         | O(k)                 |
| remove     | O(n)         | O(n)                 |
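
For front insertions the deque’s O(1) appendleft beats the list’s O(n) insert; a quick sketch (indicative timings only):

import timeit

# Pushing 10,000 items onto the front of a deque vs. a list.
print timeit.timeit('d.appendleft(0)',
                    setup='from collections import deque; d = deque()',
                    number=10000)
print timeit.timeit('l.insert(0, 0)', setup='l = []', number=10000)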

dict

 

The average-case times below assume that the keys’ hash function keeps collisions rare; the O(n) worst cases occur when many keys collide.

| Operation    | Average Case | Amortized Worst Case |
|--------------|--------------|----------------------|
| Copy[2]      | O(n)         | O(n)                 |
| Get Item     | O(1)         | O(n)                 |
| Set Item[1]  | O(1)         | O(n)                 |
| Delete Item  | O(1)         | O(n)                 |
| Iteration[2] | O(n)         | O(n)                 |
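
The average-case O(1) lookup is why x in d on a dict is so much cheaper than x in s on a list of the same size; a rough sketch:

import timeit

# Membership test against 100,000 keys: hash lookup vs. linear scan.
print timeit.timeit('99999 in d',
                    setup='d = dict((i, None) for i in range(100000))',
                    number=1000)
print timeit.timeit('99999 in l', setup='l = range(100000)', number=1000)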

References:

http://wiki.python.org/moin/TimeComplexity

http://en.wikipedia.org/wiki/Linked_list


Increasing number of threads per process

There is no direct limit on the number of threads a process can have. Instead, the practical limit follows from this formula:

number of threads = total virtual memory / (stack size * 1024 * 1024)

Thus, the number of threads per process can be increased either by increasing the total virtual memory or by decreasing the per-thread stack size. Decreasing the stack size too far can make code fail with stack overflows, while the maximum virtual memory is limited by the swap memory.
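
The same calculation can be done from inside Python with the resource module; a minimal sketch, assuming both limits are actually set (on many systems ulimit -v is unlimited by default):

import resource

# getrlimit() reports both limits in bytes; RLIM_INFINITY means "unlimited".
vm_limit, _ = resource.getrlimit(resource.RLIMIT_AS)        # total virtual memory
stack_limit, _ = resource.getrlimit(resource.RLIMIT_STACK)  # per-thread stack size

if resource.RLIM_INFINITY in (vm_limit, stack_limit):
    print "virtual memory or stack size is unlimited; check ulimit -v / ulimit -s"
else:
    print "approximate max threads per process:", vm_limit // stack_limit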

Check your machine:

Total Virtual Memory: ulimit -v (default is unlimited, so to raise the effective limit you need to increase swap memory)

Total Stack Size: ulimit -s (default is 8 MB)

Commands to change these values:

ulimit -s newvalue

ulimit -v newvalue

Replace newvalue with the limit you want to set.
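
The per-thread stack can also be lowered from inside Python with threading.stack_size(), which applies to threads created afterwards; a sketch (the 512 KB value is only an example, and going too low causes the stack overflows mentioned above):

import threading

# Ask for a 512 KB stack per thread instead of the 8 MB default, so many more
# threads fit inside the same virtual memory limit.
threading.stack_size(512 * 1024)

def worker():
    pass

threads = [threading.Thread(target=worker) for _ in range(1000)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print "started and joined", len(threads), "threads"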

How to increase swap memory?

http://www.ehow.com/how_5001512_increase-virtual-memory-linux.html


Zappos Review Parser

def create_hashing_table():
    # Map each crawled file name (md5 of the url + ".uss") back to its
    # "product_id::url" line from product_id_with_urls.
    import md5
    urls = open("product_id_with_urls").readlines()
    hashing_table = []
    for url in urls:
        u = url
        url = url.split("::")[1]
        if len(url) > 1:
            file_name = str(md5.new(url.strip()).hexdigest()) + ".uss"
            hashing_table.append(str(file_name + ":.:" + u))
    return hashing_table

def find_date(content):
    try:
        pattern = '<div>'
        l = len(pattern)
        focus = content[content.find(pattern)+l:content.find('<div>')]
        return str(focus[focus.find('<p>')+3:focus.find('</p')]).strip()
    except Exception, ex:
        return False

def find_review(content):
    try:
        pattern = '<p'
        l = len(pattern)
        focus = content[content.find(pattern)+l:content.find('<p>')]
        return str(focus[focus.find('>')+1:focus.find('</p')]).strip()
    except Exception, ex:
        return False

def find_author(content):
    try:
        pattern = '<div>'
        l = len(pattern)
        focus = content[content.find(pattern)+l:content.find('<p>')]
        return str(focus[focus.find('<h3>')+4:focus.find('</h3')]).strip()
    except Exception, ex:
        return False

def find_rating(content):
    try:
        pattern = '<div>'
        l = len(pattern)
        focus = content[content.find(pattern)+l:content.find('<p><strong>Comfort</strong>')]
        return str(int(focus[focus.find('Rated:')+6:focus.find(' stars')])).strip()
    except Exception, ex:
        return False

def start():
    # Walk the crawled pages in round2/ and parse the Zappos ones.
    import os
    hashing_table = create_hashing_table()
    count = 0
    success = 0
    for filename in os.listdir('round2'):
        for x in hashing_table:
            #print x, filename
            if x.find(filename) > -1:
                #print "matched"
                url = x.split(":.:")[1].split("::")[1]
                product_id = x.split(":.:")[1].split("::")[0]
                if url.find("www.zappos.com") > -1:
                    if parse_reviews(filename, url, product_id):
                        success += 1
                    count += 1
    print success, "/", count

def parse_reviews(filename, url, product_id):
    html = open("round2/" + filename).read()
    content = html.split('<div>')[1:]
    for x in content:
        date = find_date(x)
        review = find_review(x)
        rating = find_rating(x)
        author = find_author(x)
        if product_id and url and date and review and rating and author:
            try:
                create_uml(product_id, url, date, review, rating, author)
            except Exception, ex:
                print "Error Writing", filename, ex
                return False
        else:
            print "Error Parsing ", filename, product_id, url, date, review, rating, author
            return False
    return True

def create_uml(product_id, url, date, review, rating, author):
    # Append one ":::"-delimited record per review to zappos.uml.
    pattern = product_id + ":::" + url + ":::" + date + ":::" + review + ":::" + rating + ":::" + author + "--->"
    #print pattern
    f = open("zappos.uml", "a+")
    f.write(pattern)
    f.close()

start()


Search a filename in the current directory using pattern matching

import os
import fnmatch

# Print every file in the current directory whose name matches the pattern.
for file in os.listdir('.'):
    if fnmatch.fnmatch(file, '*.txt'):
        print file
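
The glob module does the same pattern matching against the current directory in one call, if a list of matches is all that is needed:

import glob

# All *.txt files in the current directory.
print glob.glob('*.txt')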

Curl Single Threaded Crawler using TOR

import subprocess
import sys, threading, time
import pycurl

def refresh_ip():
    print "Refreshing IP ..."
    try:
        # Restart tor so a new circuit (and exit IP) is built, and wait for the
        # restart to finish before crawling continues.
        process = subprocess.Popen('sudo /etc/init.d/tor restart', shell=True, stdout=subprocess.PIPE)
        process.wait()
    except Exception, ex:
        print "Failed to Refresh IP. ", ex

# We should ignore SIGPIPE when using pycurl.NOSIGNAL - see
# the libcurl tutorial for more info.
try:
    import signal
    from signal import SIGPIPE, SIG_IGN
    signal.signal(signal.SIGPIPE, signal.SIG_IGN)
except ImportError:
    pass

class Test(threading.Thread):
    def __init__(self, url, ofile):
        threading.Thread.__init__(self)
        self.curl = pycurl.Curl()
        self.curl.setopt(pycurl.URL, url)
        self.curl.setopt(pycurl.WRITEDATA, ofile)
        self.curl.setopt(pycurl.FOLLOWLOCATION, 1)
        self.curl.setopt(pycurl.MAXREDIRS, 5)
        self.curl.setopt(pycurl.NOSIGNAL, 1)
        self.curl.setopt(pycurl.USERAGENT, 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.1 (KHTML, like Gecko) Ubuntu/11.04 Chromium/14.0.835.202 Chrome/14.0.835.202 Safari/535.1')
        self.curl.setopt(pycurl.PROXY, '127.0.0.1:9050')
        self.curl.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_SOCKS5)
        self.curl.setopt(pycurl.REFERER, 'http://www.google.co.in/')

    def run(self):
        self.curl.perform()
        self.curl.close()
        sys.stdout.write(".")
        sys.stdout.flush()

# Read list of URIs from file specified on commandline
try:
    urls = open(sys.argv[1]).readlines()
except IndexError:
    # No file was specified, show usage string
    print "Usage: %s <file with uris to fetch>" % sys.argv[0]
    raise SystemExit

# Initialize thread array and the file number
threads = []

# Start one thread per URI in sequence
fileno = 0
t1 = time.time()
for url in urls:

    f = open(str(fileno), "wb")
    t = Test(url.rstrip(), f)
    t.start()
    fileno = fileno + 1
    t.join()
    f.close()
    refresh_ip()
    time.sleep(3)
t2 = time.time()
print "\n** Singlethreading, %d seconds elapsed for %d uris" % (int(t2-t1), len(urls))
