Crawling Tuchong images and downloading them; storing the Scrapy-crawled data in MySQL

items.py — define the data fields according to your own needs

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class TodayScrapyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class TuchongItem(scrapy.Item):
    title = scrapy.Field()      # gallery title
    views = scrapy.Field()      # number of views
    favorites = scrapy.Field()  # number of favorites (likes)
    img_url = scrapy.Field()    # image URL

    # def get_insert_sql(self):
    #     # SQL statement used when storing the item
    #     sql = 'insert into tuchong(title,views,favorites,img_url)' \
    #           ' VALUES (%s, %s, %s, %s)'
    #     # the data to store
    #     data = (self['title'], self['views'], self['favorites'], self['img_url'])
    #     return (sql, data)
Runtime environment:
* Python 2.7.12  
* Scrapy 1.2.2
* Mac OS X 10.10.3 Yosemite

I. The goal, as shown in the figure

(Figure: the crawl target)

With Scrapy, you first create the project from the command line, then define the data fields in items.py; after parsing the page data you also have to create the database table before inserting anything. Since Python is a high-level dynamic language, I figured it should be possible to enter the field definitions once and have the table created and the data stored automatically.
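As a side note, one way to get at the declared fields without parsing items.py as text is the fields dict that every scrapy.Item subclass exposes. This is only a sketch of that idea (run from inside the project, Python 2), not the script used later in this post:

# sketch: build a CREATE TABLE statement from the fields declared on an Item class
from today_scrapy.items import TuchongItem

columns = ', '.join('%s varchar(100)' % name for name in TuchongItem.fields)
create_sql = 'create table tuchong(%s) engine=innodb default charset=utf8' % columns
print create_sql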

Using Python 2.7.12

settings.py — set the request headers and enable the item pipeline

# -*- coding: utf-8 -*-

# Scrapy settings for today_scrapy project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'today_scrapy'

SPIDER_MODULES = ['today_scrapy.spiders']
NEWSPIDER_MODULE = 'today_scrapy.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'today_scrapy (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  'Accept-Language': 'en',
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'today_scrapy.middlewares.TodayScrapySpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'today_scrapy.middlewares.TodayScrapyDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   # 'today_scrapy.pipelines.TodayScrapyPipeline': 300,
    'today_scrapy.pipelines.TuchongPipeline': 200,

}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

Later posts will crawl the practice site provided by the Scrapy 1.2.2 documentation: http://quotes.toscrape.com

II. Implementation approach

Arguably there are two ways to implement this:
one is to create the table first, read out its fields, and write them into items.py;
the other is to define the fields in items.py first, read them out, and then create the table. I chose the second.

Starting from a rough draft, the overall logic is: first write a class that creates a project, passing in the project name to create it; then define the fields in the created project's items.py; then read the class name and field names back out of items.py and regenerate (overwrite) items.py.

I. MongoDB

A small example

1. spider: dmoz_item.py

import scrapy
from dmoz.items import DmozItem

class DmozItemSpider(scrapy.Spider):

    name = "dmoz_item"
    #allowed_domains = ["dmoz.org"]
    start_urls = ['http://www.dmoz.org/Computers/Programming/Languages/Python/Books/']

    def parse(self, response):
        list=response.xpath('/html/body/div[5]/div/section[3]/div/div/div/div[3]')
        for i in list:
            item=DmozItem()
            item['link']=i.xpath('a/@href').extract()
            item['title']=i.xpath('a/div/text()').extract()
            item['desc']=i.xpath('div/text()').extract()
            yield item

pipelines.py — download the images into a folder named after the gallery

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os
import requests

class TodayScrapyPipeline(object):
    def process_item(self, item, spider):
        return item

class TuchongPipeline(object):
    def process_item(self, item, spider):
        img_url = item['img_url']  # the image URL taken from the item
        img_title = item['title']  # the gallery title, used as the folder name
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
            'cookie': 'webp_enabled=1; bad_ide7dfc0b0-b3b6-11e7-b58e-df773034efe4=78baed41-a870-11e8-b7fd-370d61367b46; _ga=GA1.2.1188216139.1535263387; _gid=GA1.2.1476686092.1535263387; PHPSESSID=4k7pb6hmkml8tjsbg0knii25n6'
        }
        if not os.path.exists(img_title):
            os.mkdir(img_title)
        filename = img_url.split('/')[-1]
        with open(img_title + '/' + filename, 'wb+') as f:
            f.write(requests.get(img_url, headers=headers).content)
        return item
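The title of this post mentions storing the crawled data in MySQL, and the commented-out get_insert_sql() in items.py hints at how. The following is only a rough sketch of such a pipeline: it assumes get_insert_sql() is uncommented, a matching tuchong table already exists, the connection parameters are adjusted to your own setup, and the class is registered in ITEM_PIPELINES.

# rough sketch of a MySQL storage pipeline (not part of the original project)
import MySQLdb

class MysqlPipeline(object):
    def open_spider(self, spider):
        # placeholder connection parameters -- change them to match your MySQL setup
        self.conn = MySQLdb.connect(host='localhost', user='root', passwd='******',
                                    db='local_db', charset='utf8', port=3306)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        sql, data = item.get_insert_sql()  # assumes the method in items.py is uncommented
        try:
            self.cursor.execute(sql, data)
            self.conn.commit()
        except Exception, e:  # Python 2 syntax, matching the rest of the post
            print('Insert error', e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()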


III. Getting started with the code

2.items: items.py

import scrapy

class DmozItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title=scrapy.Field()
    desc=scrapy.Field()
    link=scrapy.Field()

The spider file

For now you don't need to worry about the crawler getting blocked, which makes the site good for beginner practice.

1. Define a class that creates the project

In create_project.py, define a class that creates a project;
pass in the project name and the project is created automatically.

#coding:utf-8
import os
from scrapy import cmdline

class Project():
    def __init__(self,project):
        self.project=project
    def create_project(self):
        filepath=os.getcwd()
        os.chdir(filepath)
        line="scrapy startproject %s" % self.project
        print line
        cmdline.execute(line.split())
        return None
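A quick usage sketch (the project name here is just an example), e.g. at the bottom of create_project.py:

if __name__ == '__main__':
    Project('Tttt').create_project()  # runs `scrapy startproject Tttt` in the current directory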

Now for the main course:

3. settings: settings.py

ITEM_PIPELINES = {
   'dmoz.pipelines.DmozPipeline': 300,
}

MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'spider1'
MONGODB_DOCNAME = 'book_item'

tuchong.py

The image URLs can be pieced together directly from fields in the JSON response.

# -*- coding: utf-8 -*-
import scrapy
import json
from today_scrapy.items import TuchongItem


class TuchongSpider(scrapy.Spider):
    name = 'tuchong'
    allowed_domains = ['tuchong.com']
    start_urls = ['http://tuchong.com/']

    def start_requests(self):
        for pag in range(1, 20):
            referer_url = 'https://tuchong.com/rest/tags/自然/posts?page={}&count=20'.format(pag)   # the tag in the URL ('自然') can be swapped for another one
            form_req = scrapy.Request(url=referer_url, callback=self.parse)
            form_req.headers['referer'] = referer_url
            yield form_req

    def parse(self, response):
        tuchong_info_html = json.loads(response.text)
        # print(tuchong_info_html)
        postList_c = len(tuchong_info_html['postList'])
        # print(postList_c)
        for c in range(postList_c):
            print(c)
            # print(tuchong_info_html['postList'][c])
            title = tuchong_info_html['postList'][c]['title']
            print('Gallery title: ' + title)
            views = tuchong_info_html['postList'][c]['views']
            print(str(views) + ' views')
            favorites = tuchong_info_html['postList'][c]['favorites']
            print('Favorites: ' + str(favorites))
            images_c = len(tuchong_info_html['postList'][c]['images'])
            for img_c in range(images_c):
                user_id = tuchong_info_html['postList'][c]['images'][img_c]['user_id']
                img_id = tuchong_info_html['postList'][c]['images'][img_c]['img_id']
                img_url = 'https://photo.tuchong.com/{}/f/{}.jpg'.format(user_id, img_id)
                item = TuchongItem()
                item['title'] = title
                item['views'] = views          # these two fields were declared in items.py,
                item['favorites'] = favorites  # so fill them in as well
                item['img_url'] = img_url
                # hand the item over to the pipelines
                yield item
 


Use items to wrap the content you want to scrape. Managing everything through items.py makes it easy to pass the scraped content on to the pipelines for further processing. It also decouples the spider file spider.py from the data definition, so the responsibilities are clearer: the spider sends the requests and parses the pages; items.py manages the scraped content.

2. Create a new create_table.py that reads the class name and field data out of items.py

#coding:utf-8
import os
import MySQLdb
# import the project-creation class
from create_project import Project

# the project name
project_name='Tttt'
# instantiate the imported class (project creation)
project=Project(project_name)
# path of items.py
path1=os.getcwd()
path=os.path.join(path1,'items.py')

# before running the code below, the fields must already be defined in items.py as: field = Field()

# read items.py from the created project
file=open(path,'r')
data=file.readlines()
# two empty lists to hold the key data read out of the file
field_listx=[]
field_listy=[]
class_sign='class '
sign_1=''
sign_2='(Item):'

for _class in data:
    if 'class' in _class:
        class_name=_class.replace('(scrapy.Item):','').replace(class_sign,'').strip('\n')
        get_class_name=class_sign+sign_1+class_name+sign_2
    if '=' in _class:
        # print _class
        field_listx.append(_class)
        fieldy=_class.split('=')[0].strip()+' varchar(100)'
        field_listy.append(fieldy)
file.close()

# print the class name that scrapy generated
print get_class_name
items_dict={'code':"#coding:utf-8",'moduel':"from scrapy import Item,Field"}

# regenerate (overwrite) items.py:
items=open('items.py','w')
items.write(items_dict['code']+'\n')
items.write(items_dict['moduel']+'\n')
items.write('\n')
items.write(get_class_name+'\n')
for field_x in field_listx:
    # normalise the indentation and switch scrapy.Field() to Field(), to match the new import line
    items.write(' '*4 + field_x.strip().replace('scrapy.Field()', 'Field()') + '\n')
items.close()

# create the table
Field=str(field_listy)
data=Field.replace("'",'').replace('[','').replace(']','')
del_sql='drop table if EXISTS %s' % project_name
create_sql=('create table %s('+data+') engine=innodb default charset=utf8') % project_name
print create_sql
conn=MySQLdb.connect(host='localhost',user='root',passwd='******',db='local_db',charset='utf8',port=3306)  # replace ****** with your MySQL password
with conn:
    cursor=conn.cursor()
    # drop the table first if it already exists, to avoid errors
    cursor.execute(del_sql)
    cursor.execute(create_sql)
    conn.commit()
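As a worked example (the field names here are hypothetical): suppose the generated Tttt/items.py has been trimmed down to

import scrapy

class TtttItem(scrapy.Item):
    title = scrapy.Field()
    link = scrapy.Field()

The script then reads out TtttItem, title and link, rewrites items.py with the from scrapy import Item,Field header, and the create_sql it prints and executes is:

create table Tttt(title varchar(100), link varchar(100)) engine=innodb default charset=utf8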

4. Finally, the pipeline: pipelines.py

Note: with this approach, settings is imported from scrapy.conf.

from scrapy.conf import settings
import pymongo

class DmozPipeline(object):
    # def process_item(self, item, spider):
    #     return item
    def __init__(self):
        port = settings['MONGODB_PORT']
        host = settings['MONGODB_HOST']
        db_name = settings['MONGODB_DBNAME']
        client = pymongo.MongoClient(host=host, port=port)
        db = client[db_name]
        self.post = db[settings['MONGODB_DOCNAME']]

    def process_item(self, item, spider):
         book_info = dict(item)
         self.post.insert(book_info)
         return item

Rewriting the first crawler

3. Things to note

In create_table.py, because the project is created first, the script sits at the same level as the project's root directory, while items.py has to be read from inside the project structure. So after the project is created,
create_project.py and create_table.py
need to be copied or moved into the same directory as items.py. That is the next problem to solve, along with wrapping up the SQL for automatically inserting the data.
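A rough sketch of automating that move (it assumes scrapy's default layout, i.e. <project>/<project>/items.py; the helper name is made up):

import os
import shutil

def move_scripts(project_name):
    # directory that contains items.py in a freshly generated project
    target = os.path.join(os.getcwd(), project_name, project_name)
    for script in ('create_project.py', 'create_table.py'):
        shutil.copy(script, target)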

MongoDB, method 2 (this one was tested successfully)

First, start the local MongoDB service:

sudo service mongodb start

Step 1: declare the items

First, we rewrite the first crawler.

In the project directory there is an items.py file. This is where the items live, i.e. where the scraped content is kept. We need to tell Scrapy in items.py what the content we want to scrape is called; in other words, we need to declare the items.

items.py is rewritten as follows:

import scrapy

class QuotesItem(scrapy.Item):
    quote = scrapy.Field()
    author = scrapy.Field()
    tags = scrapy.Field()

This means the content we want to scrape is: quote, author, and tags.

Add the database configuration to the settings.py file:

MONGO_HOST = "127.0.0.1"  # host IP
MONGO_PORT = 27017  # port
MONGO_DB = "Spider"  # database name
MONGO_COLL = "heartsong"  # collection name
# MONGO_USER = "zhangsan"
# MONGO_PSW = "123456"

Step 2: import the class from items.py

Create a new spider file quotes_2_4.py and copy the contents of the first spider file into it, as follows:

import scrapy

class QuotesSpider(scrapy.Spider):
    name = 'quotes_2_1'
    start_urls = [
        'http://quotes.toscrape.com'
    ]
    allowed_domains = [
        'toscrape.com'
    ]

    def parse(self,response):
        for quote in response.css('div.quote'):
            yield{
                'quote': quote.css('span.text::text').extract_first(),
                'author': quote.css('small.author::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract(),
            }

First, change two things:

  • At the top of the file, import the item class: from quotes_2.items import QuotesItem. Here quotes_2.items means the items file in the quotes_2 project, and import QuotesItem imports the QuotesItem class, the one in which we declared the items above. (If several classes are declared, you can use from <project>.items import * to import every class in items.py.)
  • Change the spider name: name = 'quotes_2_4'

Then write pipelines.py:

# -*- coding: utf-8 -*-

import pymongo
from scrapy.conf import settings

class HeartsongPipeline(object):  # the class name must match the last part of the entry in ITEM_PIPELINES in settings.py
    def __init__(self):
        # connect to the database
        self.client = pymongo.MongoClient(host=settings['MONGO_HOST'], port=settings['MONGO_PORT'])
        # if the database requires a username and password:
        # self.client.admin.authenticate(settings['MONGO_USER'], settings['MONGO_PSW'])
        self.db = self.client[settings['MONGO_DB']]  # get a handle to the database
        self.coll = self.db[settings['MONGO_COLL']]  # get a handle to the collection

    def process_item(self, item, spider):
        postItem = dict(item)  # convert the item to a dict
        self.coll.insert(postItem)  # insert one record into the database
        return item  # returning the item lets it be logged to the console; optional

One thing worth pointing out here: unlike MySQL, MongoDB does not require you to define the table and schema up front. When you insert, if the collection does not exist yet, it is created automatically.
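For example, this throwaway pymongo snippet (the values are made up) creates the collection on the first insert:

import pymongo

client = pymongo.MongoClient('127.0.0.1', 27017)
db = client['Spider']
# 'heartsong' does not have to exist beforehand; the first insert creates it
db['heartsong'].insert({'quote': 'test', 'author': 'nobody', 'tags': []})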


Once the changes are made, it is ready to run; on the command line, type the command we already know well:

scrapy crawl heartsong

Step 3: rewrite the parse() function

Next, the contents of the yield {} inside the parse() function need to change; the rewritten parse() function is as follows:

    def parse(self,response):
        for quote in response.css('div.quote'):
            item = QuotesItem()
            item['quote'] = quote.css('span.text::text').extract_first()
            item['author'] = quote.css('small.author::text').extract_first()
            item['tags'] = quote.css('div.tags a.tag::text').extract()
            yield item

Specifically:

  • Instantiate the item: item = QuotesItem()
  • Assign values to the item's fields.
  • yield item.

This way, items are used to wrap the scraped content, achieving the goal of managing the content through items.py.

The final spider file is as follows:

import scrapy
from quotes_2.items import QuotesItem

class QuotesSpider(scrapy.Spider):
    name = 'quotes_2_4'
    start_urls = [
        'http://quotes.toscrape.com',
    ]
    allowed_domains = [
        'toscrape.com',
    ]

    def parse(self,response):
        for quote in response.css('div.quote'):
            item = QuotesItem()
            item['quote'] = quote.css('span.text::text').extract_first()
            item['author'] = quote.css('small.author::text').extract_first()
            item['tags'] = quote.css('div.tags a.tag::text').extract()
            yield item

Run the crawler:

$ scrapy crawl quotes_2_4 -o results_2_4_01.json

It achieves the same result as the first crawler.

II. MySQL: a small example

1.spider: xicidaili.py

# -*- coding: utf-8 -*-
import scrapy
from xiciip.items import XiciipItem

class XicidailiSpider(scrapy.Spider):
    name = "xicidaili"
    allowed_domains = ["xicidaili.com"]
    #start_urls = ['http://zhangjiakou.ganji.com']

    headers={
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
    }

    def start_requests(self):
        reqs=[]

        for i in range(1,3):
            req=scrapy.Request("http://www.xicidaili.com/nn/%s"%i,headers=self.headers)
            reqs.append(req)

        return reqs


    def parse(self, response):
        print ("hahahahahhahah"+response.url)

        pre_item=XiciipItem()
        # pre_item['url']=response.url
        # return pre_item
        ip_list=response.xpath('//table[@id="ip_list"]')

        trs=ip_list[0].xpath('tr')

        items=[]
####string(td[4]) extracts the concatenated text of the cell
        for i in trs[1:]:
            pre_item=XiciipItem()
            pre_item["ip"]=i.xpath('td[2]/text()')[0].extract()
            pre_item["port"]=i.xpath('td[3]/text()')[0].extract()
            pre_item["position"]=i.xpath('string(td[4])')[0].extract().strip()
            pre_item["type"]=i.xpath('td[6]/text()')[0].extract()

#####regex extraction: \. matches a literal dot, \d matches a digit
            pre_item["speed"]=i.xpath('td[7]/div[@class="bar"]/@title').re('\d{0,}\.\d{0,}')[0]
            pre_item["last_check_time"]=i.xpath('td[9]/text()')[0].extract()
            items.append(pre_item)
        return items
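To make that regex comment concrete, here is a throwaway snippet (the value '0.988秒' is a made-up example of the speed bar's title attribute):

# -*- coding: utf-8 -*-
import re

# keep only the numeric part of a value like '0.988秒'
print re.findall(r'\d{0,}\.\d{0,}', '0.988秒')[0]   # -> '0.988'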

2. items.py

import scrapy


class XiciipItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()

    ip=scrapy.Field()
    port=scrapy.Field()
    position=scrapy.Field()
    type=scrapy.Field()
    speed=scrapy.Field()
    last_check_time=scrapy.Field()

3. The main course: settings.py

MYSQL_HOSTS = '127.0.0.1'
MYSQL_USER = 'root'
MYSQL_PASSWORD = '******'
#MYSQL_PORT = settings.MYSQL_PORT
MYSQL_DB='xiciip'
CHARSET='utf8'


ITEM_PIPELINES = {
   'xiciip.pipelines.XiciipPipeline': 300,
}
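The pipeline below inserts into a proxy table in the xiciip database, so that table has to exist first. A minimal one-off sketch for creating it (the column types are my assumption):

import MySQLdb

con = MySQLdb.connect(host='127.0.0.1', user='root', passwd='******',
                      db='xiciip', charset='utf8')
cur = con.cursor()
cur.execute("create table if not exists proxy("
            "ip varchar(50), port varchar(10), position varchar(100), "
            "type varchar(20), speed varchar(20), last_check_time varchar(30)"
            ") engine=innodb default charset=utf8")
con.commit()
cur.close()
con.close()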

4.pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import MySQLdb

#### note: settings comes from scrapy.conf here
from scrapy.conf import settings


class XiciipPipeline(object):
    def process_item(self, item, spider):

        # DBKWARGS=spider.settings.get('DBKWARGS')
        # con=MySQLdb.connect(**DBKWARGS)


        host = settings['MYSQL_HOSTS']
        user = settings['MYSQL_USER']
        psd = settings['MYSQL_PASSWORD']
        db = settings['MYSQL_DB']
        c=settings['CHARSET']
        # method 2: read the connection parameters from settings (used here)
        con = MySQLdb.connect(host=host,user=user,passwd=psd,db=db,charset=c)
        # method 1 also works: hard-code the connection parameters
        #con = MySQLdb.connect(host='127.0.0.1',user='root',passwd='******',db='xiciip',charset='utf8')
        cur=con.cursor()
        sql=("insert into proxy(ip,port,position,type,speed,last_check_time) "
             "values(%s,%s,%s,%s,%s,%s)")
    #    sql=('insert into p1(url) values("%s")')
        #sql="insert into p1 values (%s)"
        #list=(item['url'].split(':')[0])
        #list=[item['url']]
        #print('wwwwwwwwwwwwwwww',list,type(list),type('h'))
        list=[item['ip'],item['port'],item['position'],item['type'],item['speed'],item['last_check_time']]

        try:
            cur.execute(sql,list)
        except Exception,e:
            print('Insert error',e)
            con.rollback()

        else:
            con.commit()

        cur.close()
        con.close()

        return item
