1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
|
'''
# @Time: 7/17/23 9:36 AM
# @Author: leazhi
# @Email: [email protected]
# @Filename: spider.py
# @Project: python
'''
import requests import jsonpath import os
class Pexels():
    """Download the photos listed on one page of the Pexels search API.

    An instance is bound to a single search-API URL: ``main()`` fetches that
    page of results and saves every photo it lists as ``<id>.jpg``.

    Args:
        url: Full search-API URL of the result page to crawl.
        payload: Optional data sent as the request body of each GET.
        page: Optional page number; only used in the progress message.
        data_dir: Optional directory the images are written to.

    When ``payload``, ``page`` or ``data_dir`` is left as ``None`` it is
    looked up, at call time, among this module's globals of the same name.
    That keeps callers working that construct ``Pexels(url)`` and set those
    names at module level instead of passing them in.
    """

    # Seconds to wait on the server before giving up; without a timeout
    # ``requests`` can block forever on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self, url, payload=None, page=None, data_dir=None):
        self.url = url
        self.payload = payload
        self.page = page
        self.data_dir = data_dir
        # NOTE(review): these headers (including Secret-Key and Sentry-Trace)
        # are hard-coded; presumably copied from a browser session - confirm
        # they are still valid and not a private credential.
        self.head = {
            'Content-Type': 'application/json',
            'Dnt': '1',
            'Referer': 'https://www.pexels.com/zh-cn/search/%E7%BD%91%E7%AB%99%E8%83%8C%E6%99%AF/',
            'Sec-Ch-Ua': '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
            'Sec-Ch-Ua-Mobile': '?0',
            'Sec-Ch-Ua-Platform': '"Linux"',
            'Secret-Key': 'H2jk9uKnhRmL6WPwh89zBezWvr',
            'Sentry-Trace': '7e26e72395084fdcacd8f577f2cd213a-8ec4a419844604d2-0',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
            'X-Client-Type': 'react',
        }

    def _setting(self, name, default=None):
        """Return the constructor value for *name*, else the module global.

        Falls back to *default* when neither was provided.
        """
        value = getattr(self, name)
        if value is None:
            value = globals().get(name, default)
        return value

    def get_html(self):
        """Fetch the search page and return the list under its ``data`` key.

        Raises:
            requests.HTTPError: If the API answers with an error status.
        """
        response = requests.get(
            url=self.url,
            # NOTE(review): the payload travels as a GET *body* while the
            # same values are already in the URL query string - confirm the
            # body is actually needed.
            data=self._setting('payload'),
            headers=self.head,
            timeout=self.REQUEST_TIMEOUT,
        )
        # Fail with a clear HTTP error instead of a confusing JSON/KeyError
        # when the server returns an error page.
        response.raise_for_status()
        return response.json()['data']

    def get_data(self, data_list):
        """Download and save every photo described in *data_list*.

        Items without a download link or id, and downloads that come back
        with an error status, are reported and skipped so that one bad entry
        does not abort the rest of the page.
        """
        page_no = self._setting('page', '?')
        body = self._setting('payload')
        for index, data in enumerate(data_list):
            links = jsonpath.jsonpath(data, '$..download_link')
            ids = jsonpath.jsonpath(data, '$..id')
            # jsonpath() returns False (not an empty list) when nothing
            # matches, so indexing the result blindly would raise TypeError.
            if not links or not ids:
                print(f'Skipping item {index+1}: no download link or id found')
                continue
            img_url, img_name = links[0], ids[0]
            print(f'第 {page_no} 页的第 {index+1} 张图片名为 {img_name},地址为:' ,img_url)
            res_img = requests.get(
                url=img_url,
                headers=self.head,
                data=body,
                timeout=self.REQUEST_TIMEOUT,
            )
            # Do not store an HTML error page under a .jpg name.
            if not res_img.ok:
                print(f'Skipping image {img_name}: HTTP {res_img.status_code}')
                continue
            self.save_data(img_name, res_img)

    def save_data(self, img_name, res_img):
        """Write ``res_img.content`` to ``<data_dir>/<img_name>.jpg``.

        The target directory is created first if it does not exist yet.
        """
        target_dir = self._setting('data_dir', 'imgs')
        os.makedirs(target_dir, exist_ok=True)
        with open(os.path.join(target_dir, f'{img_name}.jpg'), 'wb') as f:
            f.write(res_img.content)

    def main(self):
        """Fetch the result page, then download every photo it lists."""
        data_list = self.get_html()
        self.get_data(data_list)
if __name__ == '__main__':
    # `data_dir`, `page` and `payload` are deliberately module-level names:
    # the Pexels methods look them up as globals.
    data_dir = 'imgs'
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(data_dir, exist_ok=True)

    # Crawl result pages 1 through 9, one Pexels instance per page.
    for page in range(1, 10):
        payload = {
            'page': page,
            'per_page': '24',
            'query': '%E7%BD%91%E7%AB%99%E8%83%8C%E6%99%AF',
            'orientation': 'all',
            'size': 'all',
            'color': 'all',
            'seo_tags': 'true',
        }
        spider = Pexels(f'https://www.pexels.com/zh-cn/api/v3/search/photos?page={page}&per_page=24&query=%E7%BD%91%E7%AB%99%E8%83%8C%E6%99%AF&orientation=all&size=all&color=all&seo_tags=true')
        spider.main()
|