# csdn博客搬家 — CSDN blog migration script

# -*- coding: utf-8 -*-
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request
import urlparse
from csdn.items import CsdnItem

class csdnSpider( CrawlSpider ):
    name = "csdn"
    allowed_domains = [“www.csdn.net”]
    start_urls = [
            "http://blog.csdn.net/huangxiansheng1980/article/list/1",
            "http://blog.csdn.net/huangxiansheng1980/article/list/2",
            "http://blog.csdn.net/huangxiansheng1980/article/list/3",
            "http://blog.csdn.net/huangxiansheng1980/article/list/4",
            "http://blog.csdn.net/huangxiansheng1980/article/list/5",
            "http://blog.csdn.net/huangxiansheng1980/article/list/6",
            "http://blog.csdn.net/huangxiansheng1980/article/list/7",
            "http://blog.csdn.net/huangxiansheng1980/article/list/8",
            "http://blog.csdn.net/huangxiansheng1980/article/list/9",
            "http://blog.csdn.net/huangxiansheng1980/article/list/10",
            "http://blog.csdn.net/huangxiansheng1980/article/list/11",
            "http://blog.csdn.net/huangxiansheng1980/article/list/12",
            "http://blog.csdn.net/huangxiansheng1980/article/list/13",
            "http://blog.csdn.net/huangxiansheng1980/article/list/14",
            "http://blog.csdn.net/huangxiansheng1980/article/list/15",
            "http://blog.csdn.net/huangxiansheng1980/article/list/16",
            "http://blog.csdn.net/huangxiansheng1980/article/list/17",
            "http://blog.csdn.net/huangxiansheng1980/article/list/18",
            "http://blog.csdn.net/huangxiansheng1980/article/list/19",
            "http://blog.csdn.net/huangxiansheng1980/article/list/20",
            "http://blog.csdn.net/huangxiansheng1980/article/list/21",
            "http://blog.csdn.net/huangxiansheng1980/article/list/22",
            "http://blog.csdn.net/huangxiansheng1980/article/list/23",
            "http://blog.csdn.net/huangxiansheng1980/article/list/24",
            "http://blog.csdn.net/huangxiansheng1980/article/list/25",
            "http://blog.csdn.net/huangxiansheng1980/article/list/26",
            "http://blog.csdn.net/huangxiansheng1980/article/list/27",
            "http://blog.csdn.net/huangxiansheng1980/article/list/28",
            "http://blog.csdn.net/huangxiansheng1980/article/list/29",
            "http://blog.csdn.net/huangxiansheng1980/article/list/30",
            "http://blog.csdn.net/huangxiansheng1980/article/list/31",
            "http://blog.csdn.net/huangxiansheng1980/article/list/32",
            "http://blog.csdn.net/huangxiansheng1980/article/list/33",
            "http://blog.csdn.net/huangxiansheng1980/article/list/34",
            "http://blog.csdn.net/huangxiansheng1980/article/list/35",
            "http://blog.csdn.net/huangxiansheng1980/article/list/36",
            "http://blog.csdn.net/huangxiansheng1980/article/list/37",
            "http://blog.csdn.net/huangxiansheng1980/article/list/38",
            ]
    rules = (
            Rule(SgmlLinkExtractor(allow=('details/12513065',
                    ),), callback='parse_item', follow=True),
            )

    def parse_item( self, response ):
        print '++++++++crawling ' + response.url
        links = []
        hxs = HtmlXPathSelector(response)
        blog_titles = hxs.select('//h3/span/a/text()')
        blog_links = hxs.select('//h3/span/a/@href')
        #next_page_flags = hxs.select('//div[@id=”papelist”]/a/text()')
        #titles = ''
        #links = ''
        #flags = ''
        #for title in blog_titles:
        #    titles = titles + title.extract()
        #for link in blog_links:
        #    print '——–crawling ' + urlparse.urljoin(response.url, link.extract())
        #    yield Request( urlparse.urljoin(response.url, link.extract()), meta={}, callback=self.parse_item )
        #for flag in next_page_flags:
        #    flags = flags + flag.extract()
        filename = response.url.split("/")[-2]
        open( filename, 'wb' ).write( response.body )
        #open( 'all', 'w' ).write( titles.encode( 'utf-8' ))
        #open( 'all', 'a' ).write(  links )
        #open( 'all', 'a' ).write( flags.encode( 'utf-8'))
        item = CsdnItem()
        item.title = 'test'
        item.content = 'fdfjdkf'
        return item


# Module-level spider instance — presumably the pre-0.14 Scrapy convention
# of exposing an instance at import time; TODO confirm it is still required
# by this project's crawl setup (modern Scrapy discovers spider classes).
spider = csdnSpider()

# 版权所有,禁止转载. 如需转载,请先征得博主的同意,并且表明文章出处,否则按侵权处理.
# (All rights reserved; do not reprint. Reprinting requires the author's
# prior consent and attribution, otherwise it is treated as infringement.)
#
#     分享到:  (Share to:)
#
# Leave a Reply
#
# Your email address will not be published. Required fields are marked *