Wednesday, July 20, 2011

Using real data to test

For the past two weeks or so I've been working on an idea for the micro-vendor space. As part of that project I needed to build a database of products which could typically be found at such stores. For this version of the script I only looked at Makro and Woolworths; unfortunately, Pick 'n Pay just looked horrible. As a first run I used the following script (a Django management command):
from django.core.management.base import BaseCommand, CommandError

from BeautifulSoup import BeautifulSoup
import urllib2, re, json

from decimal import *

# Category tree of listing pages to scrape, keyed by wholesaler name.
# Every nested dict key is a category name; every string leaf is the URL of
# the listing page for the category path leading to it (recurseWholesalers
# walks this structure and hands each leaf URL to a page parser).
# NOTE(review): the Makro URLs carry a hard-coded Session_ID query parameter;
# presumably it was copied from a browser session -- confirm the pages still
# load once that session is gone.
WHOLESALERS = {
  "Makro" : {
    "Groceries": {
      "Carbonated Soft Drinks" : "http://www.makro.co.za/live/content.php?SortBy=1&ItemsPerPage=9999&Region=1&Action=catalog&Cat=58&Gifts=&catId=&Start=0&Images=0&Query=&ShowAll=1&Brand=&Extended=&Reduced=&Promo=&Session_ID=f9850a12c1942df9c77866b3bbf22654",
      "Confectionary & Beverage" : {
        # NOTE(review): this URL has an empty "ShowAll=" while the other two
        # Makro URLs use "ShowAll=1" -- confirm whether that is intentional.
        "Snack" : "http://www.makro.co.za/live/content.php?SortBy=1&ItemsPerPage=9999&Region=1&Action=catalog&Cat=82&Gifts=&catId=&Start=0&Images=0&Query=&ShowAll=&Brand=&Extended=&Reduced=&Promo=&Session_ID=f9850a12c1942df9c77866b3bbf22654",
        "Confectionery": "http://www.makro.co.za/live/content.php?SortBy=1&ItemsPerPage=9999&Region=1&Action=catalog&Cat=84&Gifts=&catId=&Start=0&Images=0&Query=&ShowAll=1&Brand=&Extended=&Reduced=&Promo=&Session_ID=f9850a12c1942df9c77866b3bbf22654",
      }
    }
  },
  "Woolworths" : {
    "Food & Household" : {
      "Beverages" : {
        "Carbonated Drinks" : {
          "Cans" : "http://www.woolworths.co.za/store/browse/category.jsp?q_docSort=&categoryId=cat420030&addFacet=9004%3Acat420030&howMany=99999&q_pageNum=1&viewAll=false",
        }
      },
      "Snacks, Sweets & Biscuits" : {
        "Chips & Other Snacks" : {
          "Chips / Crisps" : "http://www.woolworths.co.za/store/browse/category.jsp?q_docSort=&categoryId=cat420218&addFacet=9004%3Acat420218&howMany=99999&q_pageNum=1&viewAll=false",
          "Snack Bars": "http://www.woolworths.co.za/store/browse/category.jsp?q_docSort=&categoryId=cat420220&addFacet=9004%3Acat420220&howMany=99999&q_pageNum=1&viewAll=false",
        },
        "Chocolate Bars & Boxes" : {
          "Boxes" : "http://www.woolworths.co.za/store/browse/category.jsp?q_docSort=&categoryId=cat420226&addFacet=9004%3Acat420226&howMany=99999&q_pageNum=1&viewAll=false",
          "Chocolate Bars" : "http://www.woolworths.co.za/store/browse/category.jsp?q_docSort=&categoryId=cat420224&addFacet=9004%3Acat420224&howMany=99999&q_pageNum=1&viewAll=false",
        },
        "Dried Fruit" : "http://www.woolworths.co.za/store/browse/category.jsp?q_docSort=&categoryId=cat200032&addFacet=9004%3Acat200032&howMany=99999&q_pageNum=1&viewAll=false",
        "Nuts" : "http://www.woolworths.co.za/store/browse/category.jsp?q_docSort=&categoryId=cat200026&addFacet=9004%3Acat200026&howMany=99999&q_pageNum=1&viewAll=false",
        "Popcorn": "http://www.woolworths.co.za/store/browse/category.jsp?q_docSort=&categoryId=cat200024&addFacet=9004%3Acat200024&howMany=99999&q_pageNum=1&viewAll=false",
      },
    }
  }
}

class Command(BaseCommand):
  """Scrape the WHOLESALERS listing pages and dump the products as JSON.

  Usage: pass a single positional argument, the path of the JSON file to
  write. The file contains one list of product dicts per wholesaler.
  """
  args = '<output_file>'
  help = 'This command generates a json file which will eventually turn into a fixture file for import into database'

  def __init__(self, *args, **kwargs):
    """Set up one shared URL opener that sends a browser-like User-agent."""
    super(Command, self).__init__(*args, **kwargs)
    self.opener = urllib2.build_opener()
    self.opener.addheaders = [('User-agent', 'Mozilla/5.0')]

  def _fetchSoup(self, url):
    """Download *url* with the shared opener and return it parsed.

    The response object is closed even when reading it raises, so a failed
    read does not leak the connection.
    """
    page = self.opener.open(url)
    try:
      html = page.read()
    finally:
      page.close()
    return BeautifulSoup(html)

  def parseMakroPage(self, url):
    """Return the products listed on a Makro catalogue page.

    Each product is a dict with the keys 'brand', 'variation', 'sku',
    'product_id', 'link' and 'price' (all strings; the price is truncated to
    two decimals). Products whose markup is missing an expected piece are
    reported and skipped instead of being recorded with wrong values.
    """
    import httplib
    from time import sleep

    soup = None
    # For some reason I kept getting IncompleteRead errors, this fixed it.
    # NOTE: the retry is unbounded, exactly as before; a permanently failing
    # server keeps this loop running.
    while soup is None:
      try:
        soup = self._fetchSoup(url)
      except (httplib.IncompleteRead, httplib.BadStatusLine):
        print("Read error occurred, sleeping for 1s then I will try again!")
        sleep(1)

    products = soup.findAll('table', attrs={"background": "/live/images/product_back.gif"})
    base_url = "http://www.makro.co.za"
    data = []
    for product in products:
      try:
        title = product.find(attrs={'class': 'style4'})
        brand = title.contents[0]
        # At least one product data entry is broken (empty brand element).
        # Keep the explicit len() test: the element's truthiness may not
        # reflect whether it has any children.
        if len(brand) > 0:
          brand = brand.contents[0].strip()
        else:
          brand = ""
        variation = title.contents[1].strip()
        detail_href = product.find(attrs={"class": "style20"})['href']
        sku = str(Decimal(detail_href.split('Sku=')[1].split('|')[0]))
        product_id = detail_href.split('ProdId=')[1].split('&')[0]
        link = "%s/%s" % (base_url, title['href'].split('&')[0][1:])
        price = product.find(attrs={'class': 'style5'}).contents[0].strip().split(' ')[1].strip()
      except IndexError:
        # Previously this dropped into pdb and then fell through with unset
        # or stale values from the preceding product; skip the entry instead.
        print("Skipping Makro product with unexpected markup on %s" % url)
        continue
      price = str(Decimal(price).quantize(Decimal('.01'), rounding=ROUND_DOWN))
      print("%s [%s]:%s - %s R %s (%s)" % (product_id, sku, brand, variation, price, link))
      data.append({
        'brand' : brand,
        'variation' : variation,
        'sku' : sku,
        'product_id' : product_id,
        'link' : link,
        'price' : price,
      })
    return data

  def parseWoolworthsPage(self, url):
    """Return the products listed on a Woolworths category page.

    Each product is a dict with the keys 'name', 'link', 'product_id' and
    'price' (all strings; the price is truncated to two decimals).
    """
    soup = self._fetchSoup(url)
    products = soup.findAll('div', attrs={"class": "itemcontainerWW"})
    base_url = "http://www.woolworths.co.za"
    data = []
    for product in products:
      anchor = product.find(attrs={"class": "itemheader"}).a
      name = anchor.contents[0].strip()
      link = "%s/%s" % (base_url, anchor['href'][1:])
      # The product id is whatever follows the first '=' in the link.
      product_id = link.split('=')[1]
      # NOTE(review): the price is read from the element with class
      # "itemprice_strike" -- confirm this is the price that should be stored.
      price = product.find(attrs={"class": "itemprice_strike"}).contents[0].strip().split(' ')[1].strip()
      price = str(Decimal(price).quantize(Decimal('.01'), rounding=ROUND_DOWN))
      print("%s: %s R %s (%s)" % (product_id, name, price, link))
      data.append({
        'name' : name,
        'link' : link,
        'product_id' : product_id,
        'price' : price,
      })
    return data

  def recurseWholesalers(self, obj, parse_callback, categories=None, products=None):
    """Walk a category tree and collect the products of every leaf URL.

    obj is either a dict (keys are category names, values are sub-trees) or
    a leaf value that is passed to parse_callback, which must return a list
    of product dicts. Every product gets a 'categories' entry holding the
    list of dict keys that led to its leaf.

    categories and products default to fresh lists on every call (they used
    to be shared mutable defaults). Found products are appended to products,
    which is also returned.
    """
    categories = [] if categories is None else categories
    products = [] if products is None else products
    if isinstance(obj, dict):
      for key in obj:
        self.recurseWholesalers(obj[key], parse_callback, categories + [key], products)
    else:
      new_products = parse_callback(obj)
      for product in new_products:
        product['categories'] = categories
      products.extend(new_products)
    return products

  def handle(self, *args, **options):
    """Scrape both wholesalers and write the result to the file in args[0].

    Expects exactly one positional argument; otherwise a usage hint is
    printed and nothing is scraped or written.
    """
    if len(args) != 1:
      print("You need to specify the output filename")
      return
    woolworthsProducts = self.recurseWholesalers(
      WHOLESALERS['Woolworths'], self.parseWoolworthsPage, [], [])
    makroProducts = self.recurseWholesalers(
      WHOLESALERS['Makro'], self.parseMakroPage, [], [])
    products = {
      'Woolworths': woolworthsProducts,
      'Makro': makroProducts,
    }
    filename = args[0]
    with open(filename, mode='w') as f:
      json.dump(products, f, indent=2)

It was a fun exercise and perhaps it will help someone out there. There are still a couple of difficulties, such as identifying the same products at different wholesalers and handling product variations (e.g. flavour, size, etc.).

2 comments:

  1. Useful information shared. I am very happy to read this article. Thanks for giving us nice info. Fantastic walk through. I appreciate this post.
    Laptops for sale

    ReplyDelete
  2. Trade Stocks, Forex, And Bitcoin Anywhere In The World:exness login Is The Leading Provider Of Software That Allows You To Trade On Your Own Terms. Whether You Are Operating In The Forex, Stock, cgin Software And Anonymous Digital Wallet To Connect With The Financial World.: exness login Is A Currency Trading Company That Allows You To Trade Stocks, Forex, And Cryptocurrency.

    ReplyDelete