Monthly Archives: August 2011

Using MySQLdb rather than SQLAlchemy


import MySQLdb
import sys
import time

times = time.time()
try:
    conn = MySQLdb.connect(host="spider.db", user="root", passwd="", db="spider")
except MySQLdb.Error, e:
    print "Error %d: %s" % (e.args[0], e.args[1])
    sys.exit(1)
print (time.time() - times), ": Connected to MySQL"

rdomains = []
db = conn.cursor()
q = "SELECT DISTINCT url_queue.rdomain FROM url_queue WHERE url_queue.is_disabled = 0"
print (time.time() - times), ": Initializing MySQL query"
db.execute(q)
print (time.time() - times), ": The query executed successfully.\nNow fetching data into rdomains"
rdomains = db.fetchall()
print (time.time() - times), ": Data has been fetched and saved to rdomains! :)"
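For comparison, here is a minimal sketch of the same timed query going through SQLAlchemy instead; the engine URL is an assumption derived from the connect() settings above.

import time
from sqlalchemy import create_engine

times = time.time()
# assumed DSN matching the MySQLdb.connect() call above
engine = create_engine("mysql://root@spider.db/spider")
print (time.time() - times), ": Connected to MySQL"
q = "SELECT DISTINCT url_queue.rdomain FROM url_queue WHERE url_queue.is_disabled = 0"
rdomains = engine.execute(q).fetchall()
print (time.time() - times), ": Data has been fetched and saved to rdomains! :)"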


Getting the value of the href attribute of all tags in an HTML file with Python

from BeautifulSoup import BeautifulSoup as soup

html = soup('<body><a href="123">qwe</a><a href="456">asd</a></body>')
# tag['href'] looks the attribute up safely; the {'href': True} filter keeps
# only <a> tags that actually carry an href
print [tag['href'] for tag in html.findAll('a', {'href': True})]
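The same approach works on a file on disk; page.html here is just a stand-in name.

html = soup(open('page.html').read())
print [tag['href'] for tag in html.findAll('a', {'href': True})]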

A1.py: Code for converting Category XML file to Data in Wize2 db @ categories

This code reads category.xml and inserts each category's name, parent_id, and Nextag node id into the categories table of the wize2 database.

import urllib
import sys
from pylons import config
from veetwo.models import db


def fetchdatafromXML():
    count = 0
    handler = open("category.xml", "r")
    content = handler.read()
    tempnode = content
    # walk the flat XML string one <node>...</node> block at a time
    while tempnode.find("</node>") != -1:
        nodeid = tempnode[tempnode.find("<nodeid>") + 8:tempnode.find("</nodeid>")]
        parent_id = tempnode[tempnode.find("<parentid>") + 10:tempnode.find("</parentid>")]
        name = tempnode[tempnode.find("<name>") + 6:tempnode.find("</name>")]
        tempnode = tempnode[tempnode.find("</node>") + 1:]
        nextag_node_id = nodeid
        name = urllib.quote(name)  # escape quotes and other unsafe characters
        id = nodeid
        sql = '''insert into categories(id, name, parent_id, is_live, nextag_node_id) values (%(id)s, '%(name)s', %(parent_id)s, 'YES', %(nextag_node_id)s)''' % dict(id=id, name=name, parent_id=parent_id, nextag_node_id=nextag_node_id)
        db.wize.execute(sql)
        print count, ": ", sql
        count += 1

if __name__ == u'__main__':
    # for testing
    from contextlib import nested
    from veetwo.models import feed, site, spider, wize
    from veetwo.lib.utils.command import load_configuration
    config_file = sys.argv[1]
    config.push_process_config(load_configuration(config_file))
    for schema in (feed, site, spider, wize):
        dsn = config[u'wize.' + schema.__name__[(len(schema.__package__)) + 1:] + u'_dsn']
        schema.init(dsn)
    run_queries = [fetchdatafromXML]
    with nested(site.Session(), wize.Session()):    
        for run_query in run_queries:
            run_query()
            sys.stdout.flush()    
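String slicing works here because the file is machine generated, but a real XML parser is less fragile. A minimal sketch of the same read loop with the standard library's xml.etree.ElementTree, assuming the <root><node>... layout used above (fetchdatafromxml_et is a hypothetical name):

import xml.etree.ElementTree as ET

def fetchdatafromxml_et():
    # hypothetical alternative reader for the same category.xml
    root = ET.parse("category.xml").getroot()
    for node in root.findall("node"):
        nodeid = node.findtext("nodeid")
        parent_id = node.findtext("parentid")
        name = node.findtext("name")
        print nodeid, parent_id, name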

Convert category temp file to XML

Code for converting the temp file created in the last post to XML.

def searchforpapa(papa):
    # Look papa up in temp.utsav and return the id on its line, or -1.
    # (The original read two lines per loop pass and returned -1 after the
    # first non-matching line, so it only ever checked one line.)
    f = open("temp.utsav", "r")
    for content in f:
        if content.find(': ' + papa) != -1:
            f.close()
            return int(content[:content.find(':')])
    f.close()
    return -1


a = open("temp.utsav", "r")
handler = open("category.xml", "a")
handler.write("<root>")
id = 1
for content in a.readlines():
    # skip marker; the original compared single characters against
    # 'Providence' (never true), apparently meaning to skip such lines
    if content.find('Providence') != -1:
        print content
        continue
    colon = content.find(':')
    if colon == -1:
        continue
    nodeid = content[:colon].strip()
    rest = content[colon + 2:]
    # the category name is the last '/'-separated component of the path;
    # everything before it (minus the trailing '/') is the parent path
    name = rest
    while name.find("/") != -1:
        name = name[name.find("/") + 1:]
    parent = rest[:rest.find(name)]
    parent = parent[:len(parent) - 1]
    parentid = str(searchforpapa(parent)).strip()
    name = name.strip()
    xml = "<id>" + str(id) + "</id><node><nodeid>" + nodeid + "</nodeid><parentid>" + parentid + "</parentid><name>" + name + "</name></node>"
    handler.write(xml)
    id += 1
    print id

handler.write("</root>")
handler.close()
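The names are concatenated into the XML as-is, so a name containing &, < or > would produce an unparseable file. If that matters, xml.sax.saxutils.escape from the standard library can wrap each value before writing; a sketch of the same line with escaping:

from xml.sax.saxutils import escape

xml = "<id>" + str(id) + "</id><node><nodeid>" + escape(nodeid) + "</nodeid><parentid>" + escape(parentid) + "</parentid><name>" + escape(name) + "</name></node>"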


Fetch all nodes from Nextag

This code fetches all US category names and node ids from http://b6.pv.sv.nextag.com/serv/main/buyer/BulkCategoryCodes.jsp


import urllib2

def fetchdata(nodeid):
    url = "http://b6.pv.sv.nextag.com/serv/main/buyer/BulkCategoryListing.jsp?node=" + nodeid + "&all=y"
    print "fetching data for node id(", nodeid, ")"
    f = urllib2.urlopen(url)
    content = f.read()
    # keep only the listing inside <pre>...</pre>; +7 also skips the
    # line break that follows the opening tag
    content = content[content.find('<pre>') + 7:content.find('</pre>')]
    removespaces(content)

def removespaces(content):
    # trim leading and trailing blanks and newlines (the original did this
    # by reversing the string twice, with a find() bug), then append the
    # listing to the temp file
    content = content.strip(' \n')
    print "storing to database"
    f = open("temp.utsav", "a")
    f.write(content)
    f.close()

url = "http://b6.pv.sv.nextag.com/serv/main/buyer/BulkCategoryCodes.jsp"
f = urllib2.urlopen(url)
content = f.read()
# the US nodes sit between the MAIN channel and UK channel table cells
start = content.find("<td>MAIN channel</td>")
end = content.find("<td>UK channel</td>")
content = content[start:end]
while content.find('<option value="') != -1:
    content = content[content.find('<option value="'):]
    temp = content[15:]
    nodeid = temp[:temp.find('"')]
    temp = temp[temp.find('"') + 3:]  # move past the closing quote of the value
    # the category name runs up to the next option (or the closing </select>);
    # it is extracted here but only nodeid is actually used
    if temp.find('<option value="') == -1:
        name = temp[:temp.find('</select>')]
    else:
        name = temp[:temp.find('<option value="')]
    content = content[15:]
    fetchdata(nodeid)
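The option scraping can also be done in a couple of lines with a regular expression, run on the same MAIN-to-UK slice of the page; a sketch, assuming each node really appears as <option value="...">name:

import re

for nodeid, name in re.findall(r'<option value="([^"]+)"[^>]*>([^<]*)', content):
    fetchdata(nodeid)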
    



Fetching data from a file

def query_reviews():
    # 1.xls is a spreadsheet export in which each data row carries three
    # double-quoted cells: nodeid, name, and the parent wrapped in an <a> tag
    f = open("/home/tes/workspace/tests/src/1.xls", "r")
    c = f.readlines()
    print c[0]
    print c[1]
    id = 1
    for line in c[1:]:
        # line.split('"')[1::2] keeps the text between each pair of quotes
        fields = line.split('"')[1::2]
        if len(fields) < 3:
            continue
        nodeid, name, parent_id = fields[0], fields[1], fields[2]
        # the parent cell is an <a ...>parent</a> link; keep only its text
        parent_id = parent_id[parent_id.find('>') + 1:parent_id.find('</a>')]
        print nodeid, name, parent_id
        nextag_node_id = nodeid
        is_live = 'YES'
        # bind the actual values (the original dict mapped the column names to
        # string literals, and bound a last_modified placeholder with no value;
        # last_modified is left to its column default here)
        okies = dict(id=id, name=name, parent_id=parent_id,
                     is_live=is_live, nextag_node_id=nextag_node_id)
        sql = '''insert into categories(id, name, parent_id, is_live, nextag_node_id) values (%(id)s, %(name)s, %(parent_id)s, %(is_live)s, %(nextag_node_id)s)'''
        db.wize.execute(sql, okies)
        id += 1
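If the export is a regular delimited file, the standard csv module handles the quoting for you; a sketch, assuming tab-separated cells:

import csv

reader = csv.reader(open("/home/tes/workspace/tests/src/1.xls", "rb"), delimiter='\t')
reader.next()  # skip the header row
for row in reader:
    nodeid, name, parent_cell = row[0], row[1], row[2]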

NT Db Mapping

categories(id) -> attributes(category_id)
categories(id) -> category_datapoints(category_id)
categories(id) -> navigation(category_id)
categories(id) -> pricelist_prices(category_id)
categories(id) -> products(category_id)
categories(id) -> brand_category(category_id)

products(id) -> product_datapoints(product_id)
products(id) -> product_keys(product_id)
products(id) -> reviews(product_id)
products(id) -> price_anomalies(product_id)
products(id) -> source_avg_ratings(product_id)
products(id) -> review_key_phrases(product_id)
products(id) -> review_key_terms(product_id)
products(id) -> product_content(product_id)
products(id) -> search_terms(product_id)
products(id) -> url_related_product(product_id)

search_terms(id) -> incoming_search_terms(search_term_id)
search_terms(id) -> search_term_blacklist(search_term_id)
search_terms(id) -> url_related_search_term(search_term_id)

navigation(id) -> link_mappings(navigation_id)
navigation(id) -> navigation_hierarchy(child_id)

reviews(id) -> related_product_snippets(review_id)
reviews(id) -> snippets(review_id)

attributes(id) -> product_attributes(attribute_id)
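Each arrow reads as a foreign key, so any pair can be joined directly; for example, counting products per category along categories(id) -> products(category_id):

sql = """
SELECT c.id, c.name, COUNT(p.id) num_products
FROM categories c
LEFT JOIN products p ON p.category_id = c.id
GROUP BY c.id, c.name
"""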

NT Db BACKUP NAMESPACE

dc <-- categories
da <-- attributes (33215 rows)
dpa <-- product_attributes
dcd <-- category_datapoints
dn <-- navigation
dlm <-- link_mappings
dnh <-- navigation_hierarchy
dpp <-- pricelist_prices
dp <-- products
dpd <-- product_datapoints
dpk <-- product_keys
dr <-- reviews (only structure, no data)
drps <-- related_product_snippets
ds <-- snippets
dpa2 <-- price_anomalies
dsar <-- source_avg_ratings
drkp <-- review_key_phrases
drkt <-- review_key_terms
dpc <-- product_content
dst <-- search_terms
dist <-- incoming_search_terms
dstb <-- search_term_blacklist
durst <-- url_related_search_term (only structure) 11.40am
dbc <-- brand_category
no backup 😥 <-- url_related_product
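Assuming each backup is a same-database table copy (the actual mechanism isn't stated here), an entry corresponds to MySQL statements like:

CREATE TABLE dc SELECT * FROM categories;  -- structure + data
CREATE TABLE dr LIKE reviews;              -- structure only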

Inserting a variable into MySQL @ Python

sql = ''' %(utsav)s ''' % dict(utsav=utsav)

example:

sql = """
SELECT COALESCE(trp.relation, 'TOTAL') committed,
COUNT(DISTINCT trp.product_id) num_products
FROM %(wize_db)s.topic_related_product trp
JOIN topics t ON t.id = trp.topic_id
JOIN tmp_live_categories lc ON lc.id = t.category_id
WHERE t.is_live = 'YES'
AND trp.relation != 'DISABLED'
AND trp.score > 0
GROUP BY trp.relation
WITH ROLLUP
""" % dict(wize_db=db.wize.bind.url.database)
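%-interpolation is fine for identifiers such as the database name above, which cannot be bound as parameters; plain values are safer passed as bound parameters so the driver does the quoting. A sketch in the same style, assuming db.wize is a SQLAlchemy engine over MySQLdb:

okies = dict(utsav=utsav)
db.wize.execute("SELECT * FROM topics WHERE id = %(utsav)s", okies)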

IMDb CARA list to films table


import MySQLdb
import sys

try:
    conn = MySQLdb.connect(host="SQL09.FREEMYSQL.NET",
                           user="utsavsabharwal",
                           passwd="primanuts",
                           db="moviebook")
except MySQLdb.Error, e:
    print "Error %d: %s" % (e.args[0], e.args[1])
    sys.exit(1)
cursor = conn.cursor()
uid = 0

f = open("/home/tes/workspace/tests/src/mpaa-ratings-reasons.list", "r")
c = f.readlines()

year = ""
nameoffilm = ""
for x in range(0, len(c) - 1):
    # film entries look like "MV: Title (Year)"
    if c[x][:2] == 'MV':
        for y in range(0, len(c[x]) - 1):
            if c[x][y] == '(':
                # the four characters after '(' are the year
                for z in range(y + 1, y + 5):
                    year += c[x][z]
                # everything between "MV: " and the '(' is the title
                for z in range(3, y):
                    nameoffilm += c[x][z]
                print uid
                print year
                print nameoffilm
                # insert before clearing the buffers; the original cleared them
                # first and bound the literal strings 'uid'/'name'/'year'
                okies = dict(uid=uid, name=nameoffilm, year=year)
                cursor.execute(u'''insert into films (uid, name, year) values (%(uid)s, %(name)s, %(year)s)''', okies)
                uid = uid + 1
                year = ""
                nameoffilm = ""
                break
conn.commit()  # persist the inserts; MySQLdb does not autocommit
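Each MV line has the shape "MV: Title (Year)", so a regular expression states the intent more directly than the character loops; a sketch under the same assumptions about the list format:

import re

for line in c:
    m = re.match(r'MV: (.*?) \((\d{4})', line)
    if m:
        nameoffilm, year = m.group(1), m.group(2)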