This is a crawler I wrote quite a while ago (Python 2 + Scrapy). While tidying up my computer today I felt the code still has some reference value, so I'm posting it here, for reference only.
First, the item definition code (items.py):
```python
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class PiaoliangmmItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
    filename = scrapy.Field()
    description = scrapy.Field()
    category = scrapy.Field()
```
Next comes the spider code:
```python
# -*- coding: utf-8 -*-
import scrapy
import re
import string

from scrapy.selector import Selector
from piaoliangmm.items import PiaoliangmmItem


class PiaoLiangmmSpider(scrapy.Spider):
    name = "piaoliangmm"
    allowed_domains = ["mmonly.cc"]
    start_urls = [
        "http://www.mmonly.cc/glamour/",
        "http://www.mmonly.cc/beauty/",
        "http://www.mmonly.cc/photo/",
        "http://www.mmonly.cc/korea/",
        "http://www.mmonly.cc/beautyleg/",
        "http://www.mmonly.cc/cosplay/",
        "http://www.mmonly.cc/jiepaimeinv/"
    ]

    def parse(self, response):
        # each <a> in the list block points to one photo set
        for list in response.xpath("//div[@class='c s_li zxgx_list l']/ul/li/a"):
            url = list.xpath("@href").extract()[0]
            yield scrapy.Request(url, callback=self.parse_item_contents, dont_filter=True)
        # handle the next list page: the second-to-last pager link reads "下一页" (next page)
        nextpage = response.xpath("//div[@class='pages c mt5']/ul/li[last()-1]/a/text()").extract()[0]
        if nextpage == unicode('下一页', 'utf-8'):
            inner_url = "http://" + response.url.split('/')[-3] + "/" + response.url.split('/')[-2] + "/" + \
                response.xpath("//div[@class='pages c mt5']/ul/li[last()-1]/a").xpath("@href").extract()[0]
            yield scrapy.Request(inner_url, callback=self.parse_next_page)

    def parse_item_contents(self, response):
        # the first pager item contains the total page count of the photo set; pull out the digits
        page_text = response.xpath("//div[@class='pages c mt5']/ul/li[1]/a/text()").extract()[0]
        page_temp = re.findall(r'(\w*[0-9]+)\w*', page_text)[0]
        page_num = string.atoi(page_temp)
        for i in range(1, page_num + 1):
            if i == 1:
                next_url = response.url
            else:
                # pages 2..N are named <id>_<i>.html under the same directory
                next_url = "http://" + response.url.split('/')[2] + "/" + response.url.split('/')[3] + "/" + \
                    response.url.split('/')[-1].split('.')[0] + "_" + str(i) + ".html"
            yield scrapy.Request(next_url, callback=self.parse_image_page, dont_filter=True)

    def parse_image_page(self, response):
        mm = PiaoliangmmItem()
        title = response.xpath("//div[@class='atc_tit mt10']/h1/text()").extract()[0]
        if title:
            # strip any parenthesized suffix from the title
            p = re.compile(r'\(.*?\)')
            mm['title'] = p.sub("", title)
        link = response.xpath("//div[@id='picView']/p[@align='center']/a/img").xpath("@src").extract()[0]
        if link:
            mm['link'] = link
        filename = response.url.split('/')[-1].split('.')[0] + '.jpg'
        if filename:
            mm['filename'] = filename
        description = response.xpath("//div[@class='mt5 art_txt']/h2/text()").extract()[0]
        if description:
            mm['description'] = description
        category = response.xpath("//div[@class='position mt10']/a[2]/text()").extract()[0]
        if category:
            mm['category'] = category
        return mm

    def parse_next_page(self, response):
        # same as parse(), but for the second and later list pages
        for list in response.xpath("//div[@class='c s_li zxgx_list l']/ul/li/a"):
            url = list.xpath("@href").extract()[0]
            yield scrapy.Request(url, callback=self.parse_item_contents, dont_filter=True)
        nextpage = response.xpath("//div[@class='pages c mt5']/ul/li[last()-1]/a/text()").extract()[0]
        if nextpage == unicode('下一页', 'utf-8'):
            inner_url = "http://" + response.url.split('/')[-3] + "/" + response.url.split('/')[-2] + "/" + \
                response.xpath("//div[@class='pages c mt5']/ul/li[last()-1]/a").xpath("@href").extract()[0]
            yield scrapy.Request(inner_url, callback=self.parse_next_page, dont_filter=True)
```
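The trickiest part above is how `parse_item_contents` rebuilds the URL of every page in a photo set from the first page's URL by plain string splitting. Here is a minimal standalone sketch of that reconstruction; the URL and page count below are made-up placeholders of the same shape, not real data:

```python
# Hypothetical illustration of the paging logic in parse_item_contents.
url = "http://www.mmonly.cc/beauty/12345.html"   # placeholder first-page URL
page_num = 3                                     # normally parsed from the pager text
parts = url.split('/')
for i in range(1, page_num + 1):
    if i == 1:
        next_url = url
    else:
        # pages 2..N live at <id>_<i>.html in the same directory
        next_url = "http://" + parts[2] + "/" + parts[3] + "/" + \
            parts[-1].split('.')[0] + "_" + str(i) + ".html"
    print(next_url)
# prints .../12345.html, .../12345_2.html, .../12345_3.html
```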
Then the item pipeline code (pipelines.py):
```python
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

from twisted.enterprise import adbapi
import MySQLdb
import MySQLdb.cursors
import urllib
import sys
import os

from piaoliangmm.items import PiaoliangmmItem


class PiaoliangmmPipeline(object):
    def __init__(self):
        self.dbpool = adbapi.ConnectionPool('MySQLdb', host='127.0.0.1', db='mmpic',
                                            user='root', passwd='root',
                                            cursorclass=MySQLdb.cursors.DictCursor,
                                            charset='utf8', use_unicode=True)

    def process_item(self, item, spider):
        # the commented-out block below downloads each image to F:/mmonlypic/<title>/
        #if (os.path.exists(r'F:/mmonlypic/' + item['title']) == False):
        #    os.makedirs(r'F:/mmonlypic/' + item['title'])
        #if (item['filename']):
        #    local = "F:/mmonlypic/" + item['title'] + "/" + item['filename']
        #    urllib.urlretrieve(item['link'], local)
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        return item

    def _conditional_insert(self, tx, item):
        # originally inserted new rows; this run only back-fills the category column
        #sql = "insert into mmonly (title, link, filename, description) values (%s, %s, %s, %s)"
        #tx.execute(sql, (item['title'], item['link'], item['filename'], item['description']))
        sql = "update mmonly set category = %s where link = %s"
        tx.execute(sql, (item['category'], item['link']))
```
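One thing worth noting: `runInteraction` returns a Twisted `Deferred`, and the code above never attaches an errback, so any MySQL error is dropped silently. A minimal sketch of logging failures (the `handle_error` method is my own addition, not part of the original project):

```python
    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        query.addErrback(self.handle_error)  # report DB errors instead of swallowing them
        return item

    def handle_error(self, failure):
        # hypothetical helper: just dump the Twisted failure for debugging
        print failure
```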
And finally, the project settings (settings.py):
```python
# -*- coding: utf-8 -*-
# Scrapy settings for piaoliangmm project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'piaoliangmm'
SPIDER_MODULES = ['piaoliangmm.spiders']
NEWSPIDER_MODULE = 'piaoliangmm.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'piaoliangmm (+http://www.yourdomain.com)'
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS=32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY=3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN=16
#CONCURRENT_REQUESTS_PER_IP=16
# Disable cookies (enabled by default)
#COOKIES_ENABLED=False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED=False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'piaoliangmm.middlewares.MyCustomSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'piaoliangmm.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'piaoliangmm.pipelines.PiaoliangmmPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# NOTE: AutoThrottle will honour the standard settings for concurrency and delay
#AUTOTHROTTLE_ENABLED=True
# The initial download delay
#AUTOTHROTTLE_START_DELAY=5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY=60
# Enable showing throttling stats for every response received:
AUTOTHROTTLE_DEBUG=False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED=True
#HTTPCACHE_EXPIRATION_SECS=0
#HTTPCACHE_DIR='httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES=[]
#HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage'
```