本文实例讲述了用 Python 实现爬取千万淘宝商品的方法，分享给大家供大家参考。具体实现方法如下：
# Crawler for the Taobao mobile search API.
#
# One producer thread scans the category database and queues every category
# that is not finished yet; several worker threads pull categories from the
# queue, page through the search results, and store each item in a second
# LevelDB database. Categories discovered in a response are registered as
# "waiting" so the producer picks them up on a later scan.
import time
import leveldb
from urllib.parse import quote_plus
import re
import json
import itertools
import sys
import requests
from queue import Queue
from threading import Thread

url_base = 'http://s.m.taobao.com/search?q={}&n=200&m=api4h5&style=list&page={}'


def url_get(url):
    """Fetch *url* with browser-like headers and return the response text.

    Raises whatever ``requests.get`` raises (timeout after 5 seconds,
    connection errors, ...); callers are expected to retry.
    """
    header = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'accept-encoding': 'gzip,deflate,sdch',
        'accept-language': 'en-us,en;q=0.8',
        'connection': 'keep-alive',
        'dnt': '1',
        'user-agent': 'mozilla/12.0 (compatible; msie 8.0; windows nt)',
    }
    return requests.get(url, timeout=5, headers=header).text


def _fetch_json(url, attempts=5):
    """Download *url* and parse it as JSON, retrying up to *attempts* times.

    The exception from the final failed attempt is re-raised unchanged.
    """
    for attempt in range(attempts):
        try:
            return json.loads(url_get(url))
        except Exception:
            if attempt == attempts - 1:
                raise


def _get_state(db, key):
    """Return the value stored under *key* in *db*, or None when absent."""
    try:
        return db.Get(key)
    except KeyError:
        # py-leveldb signals a missing key with KeyError.
        return None


def item_thread(cate_queue, db_cate, db_item):
    """Worker loop: crawl every category taken from *cate_queue*.

    For each category name the worker marks it ``b'crawling'`` in *db_cate*,
    walks the result pages until an empty page is returned, writes each item
    into *db_item* keyed by its numeric id, registers newly seen categories
    as ``b'waiting'``, and finally marks the category ``b'ok'``.

    Any exception while processing a category is printed and the loop moves
    on to the next queue entry; the category keeps its last stored state.
    """
    while True:
        try:
            cate = cate_queue.get()
            cate_key = cate.encode('utf-8')

            # Skip categories that were already crawled to completion.
            if _get_state(db_cate, cate_key) == b'ok':
                print('cate-{}: already exists ... ignore'.format(cate))
                continue

            db_cate.Put(cate_key, b'crawling')
            for item_page in itertools.count(1):
                url = url_base.format(quote_plus(cate), item_page)
                items_obj = _fetch_json(url)

                # NOTE(review): the JSON field names below are kept exactly
                # as found. The surrounding text looks case-folded, so the
                # live API may use different casing -- confirm against a
                # real response before relying on them.
                items = items_obj['listitem']
                if not items:
                    # An empty page marks the end of this category.
                    break

                for item in items:
                    item_obj = dict(
                        _id=int(item['itemnumid']),
                        name=item['name'],
                        price=float(item['price']),
                        query=cate,
                        category=int(item['category']) if item['category'] != '' else 0,
                        nick=item['nick'],
                        area=item['area'],
                    )
                    db_item.Put(
                        str(item_obj['_id']).encode('utf-8'),
                        json.dumps(item_obj, ensure_ascii=False).encode('utf-8'),
                    )
                print('get {} items from {}: {}'.format(len(items), cate, item_page))

                # Register categories suggested by the response, without
                # overwriting the state of ones that are already known.
                if 'nav' in items_obj:
                    for na in items_obj['nav']['navcatlist']:
                        name_key = na['name'].encode('utf-8')
                        if _get_state(db_cate, name_key) is None:
                            db_cate.Put(name_key, b'waiting')

            db_cate.Put(cate_key, b'ok')
            print(cate, 'ok')
        except KeyboardInterrupt:
            break
        except Exception as e:
            print('an {} exception occured'.format(e))


def cate_thread(cate_queue, db_cate):
    """Producer loop: queue every category in *db_cate* that is not ``b'ok'``.

    The database is rescanned every 10 seconds. ``cate_queue.put`` blocks
    while the queue is full, which throttles the scan to the workers' pace.
    """
    while True:
        try:
            for key, value in db_cate.RangeIter():
                if value != b'ok':
                    name = key.decode('utf-8')
                    print('catethread: put {} into queue'.format(name))
                    cate_queue.put(name)
            time.sleep(10)
        except KeyboardInterrupt:
            break
        except Exception as e:
            print('catethread: {}'.format(e))


def main():
    """Open the databases, seed the first category and start all threads."""
    db_cate = leveldb.LevelDB('./taobao-cate')
    db_item = leveldb.LevelDB('./taobao-item')

    # Seed the crawl with one starting category unless it is already known.
    orig_cate = '正装'
    orig_key = orig_cate.encode('utf-8')
    if _get_state(db_cate, orig_key) is None:
        db_cate.Put(orig_key, b'waiting')

    cate_queue = Queue(maxsize=1000)
    cate_th = Thread(target=cate_thread, args=(cate_queue, db_cate))
    cate_th.start()

    item_th = [
        Thread(target=item_thread, args=(cate_queue, db_cate, db_item))
        for _ in range(5)
    ]
    for item_t in item_th:
        item_t.start()

    cate_th.join()


if __name__ == '__main__':
    main()
希望本文所述对大家的 Python 程序设计有所帮助。