from scrapy import cmdline
# Launch the 'image' spider exactly as the shell command
# `scrapy crawl image` would, passing the command as an argv-style list.
cmdline.execute(['scrapy', 'crawl', 'image'])
4. 编写 items.py 里面的字段，代码如下：
import scrapy
class MeituluItem(scrapy.Item):
    """Fields collected for a single image gallery."""

    tag = scrapy.Field()  # tags attached to the gallery
    name = scrapy.Field()  # model name
    id = scrapy.Field()  # gallery (album) number
    img_url = scrapy.Field()  # image URLs; stored as a list
5. 编写 image.py 主爬虫代码，如下：
# -*- coding: utf-8 -*-
import re

import scrapy

from ..items import MeituluItem
class ImageSpider(scrapy.Spider):
    """Spider that builds image-URL lists for the galleries on the start page.

    For every gallery entry matched by ``ul.img li`` it yields one
    ``MeituluItem`` holding the gallery id, model name, tags and the
    constructed URLs of all images in that gallery.
    """

    name = 'image'
    allowed_domains = ['www.meitulu.com']
    start_urls = ['https://www.meitulu.com/']

    def parse(self, response):
        """Extract gallery metadata from ``response`` and yield items.

        Each attribute (id, tags, name, image count) is collected as its
        own list over the whole page and the lists are then combined
        positionally with ``zip``, so an entry missing one of its
        paragraphs would shift the pairing for the entries after it.

        Args:
            response: The downloaded listing page.

        Yields:
            MeituluItem: One item per gallery entry.

        Raises:
            AttributeError: If a gallery link or a count paragraph
                contains no digits (``re.search`` returns ``None``).
            IndexError: If a name paragraph contains no text node.
        """
        # Image link template: {0} is the gallery id, {1} the 1-based
        # index of the image inside the gallery.
        url_pattern = 'https://mtl.ttsqgs.com/images/img/{0}/{1}.jpg'

        # Gallery id = first run of digits in each gallery link, e.g.
        # 'https://www.meitulu.com/item/14224.html' -> '14224'.
        hrefs = response.css('ul.img li > a::attr(href)').extract()
        gallery_ids = [re.search(r'(\d+)', href).group(1) for href in hrefs]

        # One list of tag strings per gallery (4th child paragraph).
        tags = [
            node.css('a::text').extract()
            for node in response.css('ul.img li p:nth-child(4)')
        ]

        # Model name (3rd child paragraph): use the link text when the name
        # is a link, otherwise fall back to the paragraph's own text.
        names = []
        for node in response.css('ul.img li p:nth-child(3)'):
            linked = node.css('a::text').extract()
            if linked:
                names.append(linked[0])
            else:
                # NOTE(review): str.strip() removes any of these characters
                # from BOTH ends, not just the leading label - confirm
                # against the live page before tightening this.
                names.append(node.css('::text').extract()[0].strip('模特:'))

        # Number of images per gallery: first integer found in the text of
        # the 2nd child paragraph.
        count_texts = response.css('ul.img li p:nth-child(2)::text').extract()
        totals = [
            int(re.search(r'(\d+)', text).group(1)) for text in count_texts
        ]

        for gallery_id, tag, name, total in zip(
                gallery_ids, tags, names, totals):
            item = MeituluItem()
            item['name'] = name
            item['id'] = gallery_id
            item['tag'] = tag
            # Build the image URLs for indices 1..total.
            item['img_url'] = [
                url_pattern.format(gallery_id, str(index))
                for index in range(1, total + 1)
            ]
            yield item