爬取淘女郎前100页所有图片

python

浏览数:566

2019-1-8


爬取淘女郎前100页所有图片.py

import requests
import re
import json
import os 

'''
作者:小松叔
操作系统:win10专业版
编程语言:python3.5.2
'''
# Base directory on disk; every downloaded album gets a subfolder under it.
root = "D://淘女郎//"
if not os.path.exists(root):
    # First run: create the root folder before any album folders.
    os.mkdir(root)

# Accumulators shared by the crawl loops below.
number_list, user_id_list, album_id_list = [], [], []
# number_list    - numeric id extracted for each model
# user_id_list   - raw 'user_id=NNN' fragments scraped from the listing pages
# album_id_list  - 'album_id=NNN' fragments for every album discovered

# Listing endpoint; the page number is appended when fetching.
user_url = "https://mm.taobao.com/json/request_top_list.htm?page="

# Harvest each model's user_id from the first 9 listing pages.
# Fix 1: the original re-scanned the whole user_id_list after every page,
#         so number_list accumulated multiplied duplicates; numbers are now
#         extracted once, from the current page's matches only.
# Fix 2: '[0-9]{0,9}' could match zero digits ('user_id='), which made the
#         later re.search(...).group(0) raise AttributeError; require >= 1 digit.
# Fix 3: the bare 'except:' hid that crash along with real network errors;
#         catch only requests' own exception hierarchy.
for page in range(1, 10):
    url1 = user_url + str(page)
    try:
        user_r = requests.get(url1)
        user_r.raise_for_status()
        user_r.encoding = user_r.apparent_encoding
        user_html = user_r.text
        # Each match looks like 'user_id=123456789'.
        for uid in re.findall(r'user_id=[0-9]{1,9}', user_html):
            user_id_list.append(uid)
            # Keep just the numeric part for building album URLs later.
            number_list.append(re.search('[0-9]{1,10}', uid).group(0))
    except requests.RequestException:
        print("gain web error!")
    
# For every harvested model id: discover her album ids, then download the
# photos of each album into its own folder under `root`.
# Fixes versus the original:
#   - the original had two statements fused on one line (a syntax error) and
#     the album-processing loop mis-indented, so album ids were collected
#     only from the LAST album page; collection now happens inside the page loop
#   - '&page' was missing the '=' in the album-list URL
#   - 'user_id%20=' sent a corrupted parameter name ('user_id '); fixed to 'user_id='
#   - the loop variable `i` was reused for page number, album id and picture
#     dict; each now has its own name
for Number in number_list:
    first_url = "https://mm.taobao.com/self/album/open_album_list.htm?_charset=utf-8&user_id="
    url = first_url + str(Number)
    # Each model has up to 5 pages of albums; scan them all.
    for page in range(1, 6):
        album_id_link = url + "&page=" + str(page)
        album_id_link_r = requests.get(album_id_link)
        album_id_link_r.encoding = album_id_link_r.apparent_encoding
        album_id_link_html = album_id_link_r.text
        # Matches look like 'album_id=10000123'; de-duplicate while collecting.
        for aid in re.findall('album_id=[0-9]*', album_id_link_html):
            if aid not in album_id_list:
                album_id_list.append(aid)
    for aid in album_id_list:
        # One folder per album, named after the 'album_id=NNN' fragment.
        album_folder = root + aid + "//"
        if not os.path.exists(album_folder):
            os.mkdir(album_folder)
        for k in range(1, 3):  # first 2 photo pages of each album
            # `aid` already carries the 'album_id=' prefix, so it is spliced
            # into the query string as-is.
            URL = ("https://mm.taobao.com/album/json/get_album_photo_list.htm?user_id="
                   + Number + "&" + aid
                   + "&top_pic_id=0&cover=%2F%2Fimg.alicdn.com%2Fimgextra&page=" + str(k))
            picture_link_r = requests.get(URL)
            picture_link_r.encoding = picture_link_r.apparent_encoding
            # The endpoint returns JSON; decode it into a dict.
            picture_link_data = json.loads(picture_link_r.text)
            print(picture_link_data['isError'])
            # isError == "1" signals an empty page; only "0" carries pictures.
            if picture_link_data['isError'] == str(0):
                for pic in picture_link_data['picList']:
                    picture_name = pic['picUrl'].split('/')[-1]
                    Path = album_folder + picture_name
                    # Skip files already downloaded on a previous run.
                    if not os.path.exists(Path):
                        # picUrl is protocol-relative ('//img.alicdn.com/...').
                        r = requests.get("https:" + pic['picUrl'])
                        with open(Path, 'wb') as f:
                            f.write(r.content)