Python
import json
import os
import re
from itertools import compress

import requests
# Request helper: fetch a URL and return either decoded text or raw bytes
def request_get(url, ret_type="text", timeout=5, encoding="GBK"):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"
    }
    res = requests.get(url=url, headers=headers, timeout=timeout)
    res.encoding = encoding
    if ret_type == "text":
        return res.text
    elif ret_type == "image":
        return res.content
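# Usage example (illustrative, not part of the crawl flow; image_url is a placeholder):
# page_html = request_get("http://www.netbian.com/")       # HTML text
# img_bytes = request_get(image_url, ret_type="image")     # raw bytes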
# Crawl entry point
def main():
    # Ask the user which category number to download
    down_type = get_download_type()
    image_type = get_imagetype()
    image_type = image_type[down_type]
    # The category path that will be downloaded
    print(image_type)
    # Total number of list pages in this category
    page_count = get_pages(image_type)
    print("Total pages:", page_count)
    page_count = int(page_count)
    # Page 1 is index.htm; pages 2..page_count are index_<n>.htm
    urls = [f"http://www.netbian.com{image_type}index_{i}.htm" for i in range(2, page_count + 1)]
    url = "http://www.netbian.com" + image_type + "index.htm"
    urls.insert(0, url)
    for url in urls:
        print("Fetching list page:", url)
        text = request_get(url)
        # print('list page html:', text)
        parse_list_page(text)
# Parse a list page and visit each wallpaper detail page
# (renamed from format to avoid shadowing the built-in)
def parse_list_page(text):
    origin_text = split_str(text, '<div class="list">', '<div id="footer">')
    # print(origin_text)
    pattern = re.compile('href="(.*?)"')
    hrefs = pattern.findall(origin_text)
    # Keep only the detail-page links, which contain "desk"
    hrefs = [i for i in hrefs if i.find("desk") > 0]
    for href in hrefs:
        url = f"http://www.netbian.com{href}"
        print(f"Downloading: {url}")
        text = request_get(url)
        # print("detail page html:", text)
        format_detail(text)
# Get the total number of list pages for a category
def get_pages(image_type):
    url = "http://www.netbian.com" + image_type + "index.htm"
    print(url)
    text = request_get(url)
    origin_text = split_str(text, '<div class="list">', '<div id="footer">')
    pattern = re.compile('href="(.*?)"')
    hrefs = pattern.findall(origin_text)
    # Positional mask over the extracted links: positive keeps a link, negative drops it
    select = [1, 1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1]
    selected = [n > 0 for n in select]
    select_hrefs = list(compress(hrefs, selected))
    # print(select_hrefs)
    # The second-to-last remaining link is the pager entry for the last page
    total_page = select_hrefs[-2]
    total_page = re.findall(r'\d+', total_page)
    print(total_page[0])
    return total_page[0]
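# How the mask works (illustrative):
# list(compress(['a', 'b', 'c'], [True, False, True])) -> ['a', 'c']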
# Initialization: let the user pick a category by number
def get_download_type():
    key_list = []
    val_list = []
    image_type = ['Calendar', 'Anime', 'Scenery', 'Beauty', 'Game', 'Film & TV', 'Dynamic', 'Aesthetic',
                  'Design', 'Cute', 'Car', 'Flower', 'Animal', 'Festival', 'People', 'Food', 'Fruit',
                  'Architecture', 'Sports', 'Military', 'Non-mainstream', 'Other', 'Honor of Kings', 'Eye Care',
                  'League of Legends']
    select = list(range(len(image_type)))
    selected = dict(zip(image_type, select))
    print(json.dumps(selected, ensure_ascii=False, indent=4))
    download_num = input("Choose a category number to download: ")
    for k, v in selected.items():
        key_list.append(k)
        val_list.append(v)
    download_num = int(download_num)
    if download_num in val_list:
        val_index = val_list.index(download_num)
        print("You chose:", key_list[val_index])
    return download_num
# Scrape the category link list from the home page
def get_imagetype():
    url = 'http://www.netbian.com/'
    res_content = request_get(url)
    origin_text = split_str(res_content, '<div class="head">', '<div class="search">')
    pattern = re.compile('href="(.*?)"')
    hrefs = pattern.findall(origin_text)
    # First filter: keep site-relative links; second filter: slice off the trailing non-category links
    image_type_url = list(filter(lambda h: str(h).startswith('/'), hrefs))[:-10]
    return image_type_url
# Cut the substring of text between the s_html and e_html markers
def split_str(text, s_html, e_html):
    start = text.find(s_html) + len(s_html)  # skip past the start marker itself
    end = text.find(e_html)
    origin_text = text[start:end]
    return origin_text
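# Example (illustrative): split_str('<a>hello</a>', '<a>', '</a>') -> 'hello'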
# Parse a detail page: extract the image source URL and title, then save
def format_detail(text):
    origin_text = split_str(text, '<div class="pic">', '<div class="pic-down">')
    # print(origin_text)
    pattern = re.compile('src="(.*?)"')
    image_src = pattern.search(origin_text).group(1)
    print("Image source:", image_src)
    # The image name comes from the title attribute
    pattern = re.compile('title="(.*?)"')
    image_name = pattern.search(origin_text).group(1)
    # print(image_name, type(image_name))
    # Save the image
    save_image(image_src, image_name)
# Storage helper: save the image bytes into a Downloads folder under the working directory
def save_image(image_src, image_name):
    content = request_get(image_src, "image")
    dirname = os.path.join(os.getcwd(), 'Downloads')
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    else:
        print("Directory already exists")
    with open(os.path.join(dirname, f"{image_name}.jpg"), "wb") as f:
        f.write(content)
    print("Image saved")
if __name__ == '__main__':
    main()
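request_get above has no retry logic, so one transient network error aborts the whole crawl. Below is a minimal, optional sketch (not part of the original script) of a hardened variant built on requests' HTTPAdapter and urllib3's Retry; the name request_get_retry and the retry parameters are illustrative assumptions.
Python
# Hypothetical hardened request helper with automatic retries (illustrative sketch)
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def request_get_retry(url, ret_type="text", timeout=5, encoding="GBK", retries=3):
    session = requests.Session()
    # Retry GETs on common transient server errors, with exponential backoff
    retry = Retry(total=retries, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504])
    session.mount("http://", HTTPAdapter(max_retries=retry))
    session.mount("https://", HTTPAdapter(max_retries=retry))
    res = session.get(url, timeout=timeout)
    res.encoding = encoding
    return res.text if ret_type == "text" else res.content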
Python
# Scrape wallpaper images from wallhaven.cc
import os
import random
import time

import requests
from lxml import etree

# Browser-like request headers
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.104 Safari/537.36",
}
# Create the storage directory for downloaded files if it does not exist
def CreatePath(filepath):
    if not os.path.exists(filepath):
        os.makedirs(filepath)
# Fetch a page and parse it
def getUrlText(url):
    response = requests.get(url, headers=headers)  # fetch the page
    urlText = response.text
    html = etree.HTML(urlText)  # parse with lxml
    return html
# Extract the list of wallpaper detail-page links from a listing page
def getWallUrl(url):
    hrefUrl = getUrlText(url)
    section = hrefUrl.xpath('//section[@class="thumb-listing-page"]')[0]  # the listing section
    hrefList = section.xpath('./ul//@href')  # detail-page links on this page
    return hrefList
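# How the @href extraction behaves (illustrative):
# etree.HTML('<ul><li><a href="/w/1"></a></li></ul>').xpath('//ul//@href') -> ['/w/1']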
# Current timestamp for log messages
def getTime():
    nowtime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    return nowtime
# Resolve wallpaper download addresses and save them, page by page
def downWall(url, page):
    '''
    :param url: listing-page URL without the trailing page number
    :param page: number of pages to download
    :return: None; prints a completion message
    '''
    i = 0  # current page number
    m = 0  # total images downloaded
    page += 1
    for i in range(1, page):
        hrefList = getWallUrl(url + str(i))
        print('Page ' + str(i))
        print(hrefList)
        n = 0  # images downloaded on this page
        print('Downloading wallpapers on page {}'.format(i))
        for href in hrefList:
            n += 1
            imgUrl = getUrlText(href)  # fetch and parse the detail page
            imgSrc = imgUrl.xpath('//img[@id="wallpaper"]/@src')[0].strip()
            print(imgSrc)
            try:
                res = requests.get(imgSrc)
                print(res, res.status_code)
                pic_path = '/wallpaper/Picture/' + imgSrc[31:]  # slice off the fixed URL prefix, keep the file name
                print(pic_path)
                with open(pic_path, 'wb') as f:
                    f.write(res.content)
                print('{}: page {} image {} downloaded'.format(getTime(), i, n))
                time.sleep(random.uniform(0, 3))  # random delay to be gentle on the server
            except Exception as e:
                print(repr(e))
        m = m + n
    print('{}: all wallpapers downloaded, {} pages and {} images in total'.format(getTime(), i, m))
# url = 'https://wallhaven.cc/search?q=id%3A711&ref=fp&tdsourcetag=s_pcqq_aiomsg&page='
def main():
    filepath = '/wallpaper/Picture/'  # storage path
    page = int(input('How many pages do you want to download? '))
    CreatePath(filepath)
    downWall('https://wallhaven.cc/search?q=id%3A3799&page=', page)
if __name__ == '__main__':
    main()
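The fixed imgSrc[31:] slice above only works while the image URL prefix keeps exactly that length. A minimal alternative sketch, assuming imgSrc is a full image URL: derive the file name from the URL path instead, so the code survives host or path changes. The helper name pic_filename is illustrative, not part of the original script.
Python
# Hypothetical helper (illustrative): build the save path from the URL itself
import os
from urllib.parse import urlparse

def pic_filename(img_src, folder='/wallpaper/Picture/'):
    name = os.path.basename(urlparse(img_src).path)  # e.g. wallhaven-abc123.jpg
    return os.path.join(folder, name)

# Example:
# pic_filename('https://w.wallhaven.cc/full/ab/wallhaven-abc123.jpg')
# -> '/wallpaper/Picture/wallhaven-abc123.jpg'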