everypolitician-scrapers · chrismytton · Nov 8, 2016 · Nov 8, 2016 · Nov 14, 2016 · Nov 14, 2016
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,3 @@
 # Ignore output of scraper
 data.sqlite
+.cache
diff --git a/Gemfile b/Gemfile
@@ -8,10 +8,6 @@ git_source(:github) { |repo_name| "https://github.com/#{repo_name}.git" }
 ruby "2.3.1"
 
 gem "scraperwiki", github: "openaustralia/scraperwiki-ruby", branch: "morph_defaults"
-gem "nokogiri"
-gem "open-uri-cached"
 gem "pry"
-gem "colorize"
-gem "capybara"
-gem "poltergeist"
-gem 'scraped_page_archive', github: "everypolitician/scraped_page_archive", branch: "master"
+gem 'scraped_page', github: 'everypolitician/scraped_page', branch: 'master'
+gem 'open-uri-cached'
diff --git a/Gemfile.lock b/Gemfile.lock
@@ -1,11 +1,12 @@
 GIT
-  remote: https://github.com/everypolitician/scraped_page_archive.git
-  revision: 9d8a6347d122d42a983ba28ff84d24de6bdca8a0
+  remote: https://github.com/everypolitician/scraped_page.git
+  revision: 6eb806c01e8b9f3e333575a639e40a68be60ae48
   branch: master
   specs:
-    scraped_page_archive (0.4.1)
-      git (~> 1.3.0)
-      vcr-archive (~> 0.3.0)
+    scraped_page (0.1.0)
+      field_serializer
+      nokogiri
+      scraped_page_archive (>= 0.5)
 
 GIT
   remote: https://github.com/openaustralia/scraperwiki-ruby.git
@@ -19,44 +20,29 @@ GIT
 GEM
   remote: https://rubygems.org/
   specs:
-    addressable (2.4.0)
-    capybara (2.6.2)
-      addressable
-      mime-types (>= 1.16)
-      nokogiri (>= 1.3.3)
-      rack (>= 1.0.0)
-      rack-test (>= 0.5.4)
-      xpath (~> 2.0)
-    cliver (0.3.2)
+    addressable (2.5.0)
+      public_suffix (~> 2.0, >= 2.0.2)
     coderay (1.1.1)
-    colorize (0.7.7)
     crack (0.4.3)
       safe_yaml (~> 1.0.0)
+    field_serializer (0.2.0)
     git (1.3.0)
     hashdiff (0.3.0)
     httpclient (2.7.1)
     method_source (0.8.2)
-    mime-types (3.0)
-      mime-types-data (~> 3.2015)
-    mime-types-data (3.2016.0221)
-    mini_portile2 (2.0.0)
-    multi_json (1.11.2)
-    nokogiri (1.6.7.2)
-      mini_portile2 (~> 2.0.0.rc2)
+    mini_portile2 (2.1.0)
+    nokogiri (1.6.8.1)
+      mini_portile2 (~> 2.1.0)
     open-uri-cached (0.0.5)
-    poltergeist (1.9.0)
-      capybara (~> 2.1)
-      cliver (~> 0.3.1)
-      multi_json (~> 1.0)
-      websocket-driver (>= 0.2.0)
     pry (0.10.3)
       coderay (~> 1.1.0)
       method_source (~> 0.8.1)
       slop (~> 3.4)
-    rack (1.6.4)
-    rack-test (0.6.3)
-      rack (>= 1.0)
+    public_suffix (2.0.4)
     safe_yaml (1.0.4)
+    scraped_page_archive (0.5.0)
+      git (~> 1.3.0)
+      vcr-archive (~> 0.3.0)
     slop (3.6.0)
     sqlite3 (1.3.11)
     sqlite_magic (0.0.6)
@@ -69,27 +55,18 @@ GEM
       addressable (>= 2.3.6)
       crack (>= 0.3.2)
       hashdiff
-    websocket-driver (0.6.3)
-      websocket-extensions (>= 0.1.0)
-    websocket-extensions (0.1.2)
-    xpath (2.0.0)
-      nokogiri (~> 1.3)
 
 PLATFORMS
   ruby
 
 DEPENDENCIES
-  capybara
-  colorize
-  nokogiri
   open-uri-cached
-  poltergeist
   pry
-  scraped_page_archive!
+  scraped_page!
   scraperwiki!
 
 RUBY VERSION
    ruby 2.3.1p112
 
 BUNDLED WITH
-   1.13.5
+   1.13.7
diff --git a/lib/core_ext.rb b/lib/core_ext.rb
@@ -0,0 +1,10 @@
+# frozen_string_literal: true
+# Core extension: adds String#tidy.
+class String
+  # Returns a copy in which every run of whitespace ([[:space:]]) is replaced
+  # by a single ASCII space and leading/trailing whitespace is stripped.
+  def tidy
+    collapsed = gsub(/[[:space:]]+/, ' ')
+    collapsed.strip
+  end
+end
diff --git a/lib/date_of_birth.rb b/lib/date_of_birth.rb
@@ -0,0 +1,44 @@
+# frozen_string_literal: true
+# Extracts a Spanish long-form date ("7 de marzo de 1955") from a string and
+# renders it as "YYYY-MM-DD".
+class DateOfBirth
+  DATE_REGEX = /(?<day>\d+) de (?<month>[^[:space:]]*) de (?<year>\d+)/
+
+  # Spanish month names in calendar order; a month's number is its index + 1.
+  MONTHS = %w[
+    enero febrero marzo abril mayo junio
+    julio agosto septiembre octubre noviembre diciembre
+  ].freeze
+
+  # date_string - text expected to contain "<day> de <month> de <year>".
+  def initialize(date_string)
+    @date_string = date_string
+  end
+
+  # Returns the date as "YYYY-MM-DD", or '' when the text holds no date.
+  # Raises RuntimeError when the month is not one of MONTHS.
+  def to_s
+    return '' if match.nil?
+    # Convert the captures with to_i: given a String, %d parses it like an
+    # integer literal, so a zero-padded "08" is rejected as invalid octal.
+    format('%d-%02d-%02d', match[:year].to_i, month(match[:month]), match[:day].to_i)
+  end
+
+  private
+
+  attr_reader :date_string
+
+  # to_s guards against a nil date_string, which then simply fails to match.
+  def match
+    @match ||= date_string.to_s.match(DATE_REGEX)
+  end
+
+  # Maps a Spanish month name (any letter case) to its number, 1-12.
+  # Raises a plain RuntimeError: the message must not depend on String#magenta,
+  # which only exists when the colorize gem is loaded.
+  def month(str)
+    index = MONTHS.index(str.downcase)
+    raise "Unknown month #{str}" if index.nil?
+    index + 1
+  end
+end
diff --git a/lib/member_page.rb b/lib/member_page.rb
@@ -0,0 +1,144 @@
+# frozen_string_literal: true
+require_relative 'spanish_congress_page'
+require_relative 'date_of_birth'
+require_relative 'core_ext'
+
+# A single member's profile page. Each `field` block below derives one value
+# from the parsed document (`noko`) or from the page's URL query string.
+class MemberPage < SpanishCongressPage
+  # Three slash-separated numbers followed by a full stop, e.g. "13/01/2016.".
+  MEMBERSHIP_DATE = %r{(\d+)/(\d+)/(\d+)\.}
+
+  field :iddiputado do
+    query['idDiputado']
+  end
+
+  field :term do
+    query['idLegislatura']
+  end
+
+  field :name do
+    noko.css('div#curriculum div.nombre_dip').text
+  end
+
+  # First comma-separated segment of the name.
+  field :family_names do
+    name.split(/,/).first.to_s.tidy
+  end
+
+  # Last comma-separated segment of the name.
+  field :given_names do
+    name.split(/,/).last.to_s.tidy
+  end
+
+  # Inferred from the gendered title in the seat text; nil when neither matches.
+  field :gender do
+    return 'female' if seat.include? 'Diputada'
+    return 'male' if seat.include? 'Diputado'
+  end
+
+  field :party do
+    noko.at_css('#datos_diputado .nombre_grupo').text.tidy
+  end
+
+  field :source do
+    url.to_s
+  end
+
+  # Date of birth as rendered by DateOfBirth#to_s from the first matching list item.
+  field :dob do
+    DateOfBirth.new(
+      noko.xpath('.//div[@class="titular_historico"]/following::div/ul/li').first.text
+    ).to_s
+  end
+
+  field :faction do
+    faction_information[:faction].to_s.tidy
+  end
+
+  field :faction_id do
+    faction_information[:faction_id].to_s.tidy
+  end
+
+  field :start_date do
+    membership_date('Fecha alta')
+  end
+
+  field :end_date do
+    membership_date('Causó baja')
+  end
+
+  field :email do
+    personal_link_text('mailto')
+  end
+
+  field :twitter do
+    personal_link_text('twitter.com')
+  end
+
+  field :facebook do
+    personal_link_text('facebook.com')
+  end
+
+  field :phone do
+    contact_detail('Teléfono')
+  end
+
+  field :fax do
+    contact_detail('Fax')
+  end
+
+  field :constituency do
+    seat[/Diputad. por (.*)\./, 1]
+  end
+
+  # Photo URL resolved against the page URL, or nil when no photo element is found.
+  field :photo do
+    foto = noko.at_css('#datos_diputado img[name="foto"]')
+    return if foto.nil?
+    URI.join(url, foto[:src]).to_s
+  end
+
+  private
+
+  # Tidied text of the first "dip_rojo" div in the curriculum list.
+  def seat
+    @seat ||= noko.at_css('div#curriculum div.texto_dip ul li div.dip_rojo:first').text.tidy
+  end
+
+  # Tidied text of the last "dip_rojo" div in the curriculum list.
+  def group
+    @group ||= noko.at_css('div#curriculum div.texto_dip ul li div.dip_rojo:last').text.tidy
+  end
+
+  # Query-string parameters of the page URL as a Hash.
+  def query
+    @query ||= URI.decode_www_form(URI.parse(url).query).to_h
+  end
+
+  # MatchData with :faction and :faction_id captured from "Name (ID)" in the
+  # group text, or an empty Hash when the text has no parenthesised part.
+  def faction_information
+    @faction_information ||= group.match(/(?<faction>.*?) \((?<faction_id>.*?)\)/) || {}
+  end
+
+  # Date found in the text of the "dip_rojo" div(s) containing +label+: the
+  # three numbers are reversed and joined with "-" (presumably dd/mm/yyyy ->
+  # yyyy-mm-dd; confirm against the site). Returns nil when no date is present.
+  def membership_date(label)
+    xpath = %(.//div[@class="dip_rojo"][contains(.,"#{label}")])
+    found = noko.xpath(xpath).text.match(MEMBERSHIP_DATE)
+    found && found.captures.reverse.join('-')
+  end
+
+  # Tidied text of the personal links whose href contains +href_fragment+.
+  def personal_link_text(href_fragment)
+    noko.css(%(.webperso_dip a[href*="#{href_fragment}"])).text.tidy
+  end
+
+  # Rest of the line after "<label>: " in the ".texto_dip" text, tidied;
+  # '' when the label does not appear.
+  def contact_detail(label)
+    noko.css('.texto_dip').text[/#{Regexp.escape(label)}: (.*)$/, 1].to_s.tidy
+  end
+end
diff --git a/lib/members_list_page.rb b/lib/members_list_page.rb
@@ -0,0 +1,26 @@
+# frozen_string_literal: true
+require_relative 'spanish_congress_page'
+
+# One page of the members listing: exposes the member links found on it and
+# the link to the next page of results.
+class MembersListPage < SpanishCongressPage
+  # Returns the href attribute of every anchor in the results list.
+  def member_urls
+    @member_urls ||= noko.css('div#RESULTADOS_DIPUTADOS div.listado_1 ul li a').map { |p| p[:href] }
+  end
+
+  # Returns the href of the next-page link, or nil when no such link is found.
+  def next_page_url
+    link = next_page_link
+    link && link[:href]
+  end
+
+  # Returns the first "Página Siguiente" anchor inside the pagination div.
+  # NOTE(review): an XPath-flavoured selector is handed to #css; confirm the
+  # installed Nokogiri accepts it as intended.
+  def next_page_link
+    # Memoized under the method's own name; the previous @next_page_url
+    # suggested a URL string although the value is the link element.
+    @next_page_link ||= noko.css('//div[@class = "paginacion"]//a[contains("Página Siguiente")]').first
+  end
+end
diff --git a/lib/spanish_congress_page.rb b/lib/spanish_congress_page.rb
@@ -0,0 +1,15 @@
+# frozen_string_literal: true
+require 'scraped_page'
+require 'uri'
+
+# Page base class whose #url has the session marker removed from the query.
+class SpanishCongressPage < ScrapedPage
+  # Returns the superclass URL as a String with every "_piref<digits/underscores>."
+  # token deleted from its query string; a URL without a query is left as parsed.
+  def url
+    cleaned = URI.parse(super.to_s)
+    query = cleaned.query
+    cleaned.query = query.gsub(/_piref[\d_]+\./, '') if query
+    cleaned.to_s
+  end
+end