#!/usr/bin/env python
# coding: utf-8
"""Download the pages linked from china-pub "newbook" ranking lists.

This is a Python 2 script (it uses ``urllib2`` and ``reload(sys)``).  For
every ranking-list identifier in ``array`` it fetches listing pages 1-5,
extracts the links found on each listing page and saves every linked page as
an ``.html`` file inside a directory named after the list.
"""
import os
import re
import sys
import types
import urllib2

# Python 2 only: make implicit str <-> unicode conversions use GBK.
reload(sys)
sys.setdefaultencoding('gbk')

province = "上海"
city = "上海"
fileHeader = '\xEF\xBB\xBF'  # UTF-8 byte order mark
colums = '省直辖市^城市^行政区^商圈^名称^地址^联系人^联系电话^URL^公司介绍^'

# Number of listing pages requested for every ranking list.
_PAGES_PER_LIST = 5
# Saved files are numbered as if every listing page held this many entries.
_ITEMS_PER_PAGE = 20

# One match per listed entry; groups are (leading text, link URL, link text).
_ENTRY_RE = re.compile(
    r'<td height="17" style="overflow: hidden;" colspan="5">(.*?)'
    r'<a href="(.*?)" target="_blank">(.*?)</a>',
    re.S)


def getCompany(method):
    """Crawl listing pages 1.._PAGES_PER_LIST of one ranking list.

    method -- ranking-list identifier (an element of ``array``); it is
              inserted verbatim into the listing URL.
    """
    for page in range(1, _PAGES_PER_LIST + 1):
        url1 = ("http://product.china-pub.com/cache/rank3/newbook/%s_%s.html"
                % (method, page))
        print("\n##################: %s" % url1)
        httpCrawler(url1, page, method)


def httpCrawler(url, page, method):
    """Fetch one listing page and save every page it links to.

    url    -- URL of the listing page.
    page   -- 1-based listing page number, used to number the saved files.
    method -- ranking-list identifier; with its ``\\/`` separator removed it
              becomes the name of the output directory.

    Files are written as ``<directory>/<index>.<title>.html``.  Errors from
    the network or the file system propagate to the caller.
    """
    content = httpRequest(url)
    entries = _ENTRY_RE.findall(content)
    print(len(entries))
    out_dir = method.replace(r"\/", "")
    for i, entry in enumerate(entries):
        link = entry[1]
        name = entry[2]
        # Strip path separators, then decode explicitly with the same codec
        # the default-encoding setting above would apply implicitly.  The
        # second replace shortens one specific, very long bundle title.
        title = name.replace("/", "").decode('gbk').replace(
            u"+微信营销与运营:策略、方法、技巧与实践+微信营销解密:移动互联网时代的营销革命", u"")
        print("\ndownload one page: %s \n%s" % (link, name))
        if not os.path.exists('./%s' % out_dir):
            os.mkdir(r'./%s' % out_dir)
        body = httpRequest(link)
        index = (page - 1) * _ITEMS_PER_PAGE + i + 1
        target = u'%s/%s.%s' % (out_dir, index, title + '.html')
        # Binary mode keeps the downloaded bytes untouched, and the with
        # block closes the file even when the write fails.
        with open(target, 'wb') as out:
            out.write(body)
        print("ok")


def httpRequest(url):
    """Return the raw response body of a GET request to ``url``.

    The request carries a browser-like User-Agent and uses a 15 second
    timeout.  Exceptions raised by urllib2 propagate to the caller.
    """
    req_header = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0',
        'Accept': 'text/html;q=0.9,*/*;q=0.8',
        # Headers that were tried and left disabled:
        # 'Accept-Language': 'en-US,en;q=0.5',
        # 'Accept-Encoding': 'gzip',
        # 'Host': 'j3.s2.dpfile.com',
        # 'Connection': 'keep-alive',
        # 'Referer': 'http://www.baidu.com',
    }
    req_timeout = 15
    req = urllib2.Request(url, None, req_header)
    resp = urllib2.urlopen(req, None, req_timeout)
    try:
        html = resp.read()
        print("resp: %s" % (resp,))
    finally:
        # Always release the connection, even if reading fails.
        resp.close()
    return html


def writeHeader(fileheader, colums):
    """Create ``./58/daikuan.csv`` containing ``fileheader`` then ``colums``.

    The ``./58`` directory is created when missing.  No line terminator is
    written after the column row.
    """
    if not os.path.exists('./58'):
        os.mkdir(r'./58')
    with open('./58/daikuan.csv', 'w') as f:
        f.write(fileheader)
        f.write(colums)


# Trailing codes of the ranking-list identifiers, in crawl order.
# NOTE(review): presumably category codes on the site; their meaning is not
# evident from this file.
_LIST_CODES = ('51', '02', '31', '57', '47', '46', '60', '52', '59', '28')
_DAY_KEYS = ('7', '30', '90')
_MONTH_KEYS = ('7', '6', '5', '4', '3', '2', '1')


def _list_ids(code):
    """Return the ten ranking-list identifiers that end in ``code``."""
    # NOTE(review): the identifiers keep a literal backslash before the
    # slash, so it is sent as part of the URL path; confirm the server
    # really expects that.
    days = [r'day\/rank_day_%s_%s' % (key, code) for key in _DAY_KEYS]
    months = [r'month\/rank_month_%s_%s' % (key, code) for key in _MONTH_KEYS]
    return days + months


# All ranking lists to crawl: ten identifiers for each of the ten codes.
# Generating them avoids the missing-comma slip of a hand-written tuple,
# which silently merged two neighbouring identifiers into one.
array = tuple(list_id for code in _LIST_CODES for list_id in _list_ids(code))


if __name__ == '__main__':
    # writeHeader(fileHeader, colums)
    print(len(array))
    for method in array:
        getCompany(method)