本文共 2026 字,大约阅读时间需要 6 分钟。
尝试爬一个网站:
"""Scrape job listings from 51job search result pages into an Excel sheet."""

import re
import urllib.parse
import urllib.request

# Browser-like User-Agent sent with every search request.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36 SLBrowser/7.0.0.4071 SLBChan/21"
}

# Search URL template; the two %s slots are the job keyword and the page number.
# The "degreefrom" parameter had been corrupted to "°reefrom" ("&deg" decoded as
# an HTML entity) and is restored here.
SEARCH_URL = "https://search.51job.com/list/000000,000000,0000,00,9,99,%s,2,%s.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare="

# Extracts the twelve fields of one job record from the JSON embedded in the page.
ITEM_PATTERN = re.compile(r'"job_href":"(.*?)","job_name":"(.*?)".*?"company_href":"(.*?)","company_name":"(.*?)","providesalary_text":"(.*?)".*?"workarea_text":"(.*?)","updatedate":"(.*?)".*?"companytype_text":"(.*?)","degreefrom":"(.*?)".*?"attribute_text":(.*?),"companysize_text":"(.*?)","companyind_text":"(.*?)"')

# Dictionary keys for the capture groups of ITEM_PATTERN, in capture order.
ITEM_FIELDS = (
    '岗位详情', '岗位名字', '公司网址', '公司名称', '待遇', '工作地点',
    '发布时间', '公司类型', '学历要求', '招聘要求', '公司规模', '行业',
)

# Column order of the exported sheet (differs from the capture order).
SHEET_COLUMNS = (
    '公司名称', '公司网址', '公司类型', '公司规模', '行业', '工作地点',
    '岗位名字', '待遇', '岗位详情', '发布时间', '学历要求', '招聘要求',
)


def buildUrl(job, pag):
    """Return the search URL for keyword *job* and page number *pag*.

    The keyword is percent-encoded (UTF-8) so that non-ASCII keywords and
    characters such as "/" yield a valid, pure-ASCII URL; urllib cannot send
    a request whose URL contains non-ASCII characters.
    """
    # NOTE(review): single percent-encoding is assumed here; confirm against
    # the live site whether it expects a different keyword encoding.
    keyword = urllib.parse.quote(str(job), safe="")
    return SEARCH_URL % (keyword, pag)


def getContent(job, pag):
    """Download one search result page and return it as text.

    Args:
        job: Job keyword to search for.
        pag: Page number to fetch.

    Returns:
        The response body decoded as GBK.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid GBK.
    """
    req = urllib.request.Request(buildUrl(job, pag), headers=HEADERS)
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(req) as resp:
        return resp.read().decode("gbk")


def getItem(page):
    """Parse every job record found in *page*.

    Args:
        page: Text of a search result page.

    Returns:
        A list with one dict per matched record, keyed by ITEM_FIELDS.
        The list is empty when nothing matches.
    """
    return [dict(zip(ITEM_FIELDS, groups)) for groups in ITEM_PATTERN.findall(page)]


def saveExcel(result, filename='51job.xls'):
    """Write the parsed records to an Excel workbook.

    Args:
        result: List of record dicts as returned by getItem.
        filename: Output path. xlwt writes the legacy binary .xls format, so
            the default uses the .xls extension; a .xlsx name produces a file
            that Excel refuses to open.
    """
    # Imported here so fetching and parsing remain usable without xlwt installed.
    import xlwt

    wb = xlwt.Workbook()
    # Create the worksheet.
    sheet = wb.add_sheet('51job')
    # Header row.
    for col, title in enumerate(SHEET_COLUMNS):
        sheet.write(0, col, title)
    # One row per record, starting below the header.
    for row, item in enumerate(result, start=1):
        for col, key in enumerate(SHEET_COLUMNS):
            sheet.write(row, col, item[key])
    wb.save(filename)


def main():
    """Prompt for a keyword and page count, scrape the pages, and export them."""
    job = input("请输入查询职位")
    pag = input("请输入查询页数")
    total = int(pag)
    lism = []
    for i in range(1, total + 1):
        # Progress as a percentage of the requested pages.
        print(str(i / total * 100) + '%')
        lism.extend(getItem(getContent(job, i)))
    saveExcel(lism)


if __name__ == "__main__":
    main()
转载地址:http://gfuhf.baihongyu.com/