From 42c6c3e9d73b89561115d62cdc7b1c834dd5ddb6 Mon Sep 17 00:00:00 2001
From: Chris Mytton <chrismytton@gmail.com>
Date: Tue, 8 Nov 2016 12:17:44 +0000
Subject: [PATCH 01/20] Add scraped_page dependency

---
 Gemfile      |  1 +
 Gemfile.lock | 28 +++++++++++++++++++++-------
 2 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/Gemfile b/Gemfile
index ff39898ce..c81e1866f 100644
--- a/Gemfile
+++ b/Gemfile
@@ -15,3 +15,4 @@ gem "colorize"
 gem "capybara"
 gem "poltergeist"
 gem 'scraped_page_archive', github: "everypolitician/scraped_page_archive", branch: "master"
+gem 'scraped_page', github: 'everypolitician/scraped_page', branch: 'master'
diff --git a/Gemfile.lock b/Gemfile.lock
index 6546341dc..5a267447d 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -1,9 +1,19 @@
+GIT
+  remote: https://github.com/everypolitician/scraped_page.git
+  revision: 6eb806c01e8b9f3e333575a639e40a68be60ae48
+  branch: master
+  specs:
+    scraped_page (0.1.0)
+      field_serializer
+      nokogiri
+      scraped_page_archive (>= 0.5)
+
 GIT
   remote: https://github.com/everypolitician/scraped_page_archive.git
-  revision: 9d8a6347d122d42a983ba28ff84d24de6bdca8a0
+  revision: c4056f6029f284a326e56c129766e397995e55f7
   branch: master
   specs:
-    scraped_page_archive (0.4.1)
+    scraped_page_archive (0.5.0)
       git (~> 1.3.0)
       vcr-archive (~> 0.3.0)
 
@@ -19,7 +29,8 @@ GIT
 GEM
   remote: https://rubygems.org/
   specs:
-    addressable (2.4.0)
+    addressable (2.5.0)
+      public_suffix (~> 2.0, >= 2.0.2)
     capybara (2.6.2)
       addressable
       mime-types (>= 1.16)
@@ -32,6 +43,7 @@ GEM
     colorize (0.7.7)
     crack (0.4.3)
       safe_yaml (~> 1.0.0)
+    field_serializer (0.2.0)
     git (1.3.0)
     hashdiff (0.3.0)
     httpclient (2.7.1)
@@ -39,10 +51,10 @@ GEM
     mime-types (3.0)
       mime-types-data (~> 3.2015)
     mime-types-data (3.2016.0221)
-    mini_portile2 (2.0.0)
+    mini_portile2 (2.1.0)
     multi_json (1.11.2)
-    nokogiri (1.6.7.2)
-      mini_portile2 (~> 2.0.0.rc2)
+    nokogiri (1.6.8.1)
+      mini_portile2 (~> 2.1.0)
     open-uri-cached (0.0.5)
     poltergeist (1.9.0)
       capybara (~> 2.1)
@@ -53,6 +65,7 @@ GEM
       coderay (~> 1.1.0)
       method_source (~> 0.8.1)
       slop (~> 3.4)
+    public_suffix (2.0.4)
     rack (1.6.4)
     rack-test (0.6.3)
       rack (>= 1.0)
@@ -85,6 +98,7 @@ DEPENDENCIES
   open-uri-cached
   poltergeist
   pry
+  scraped_page!
   scraped_page_archive!
   scraperwiki!
 
@@ -92,4 +106,4 @@ RUBY VERSION
    ruby 2.3.1p112
 
 BUNDLED WITH
-   1.13.5
+   1.13.6

From a2da5d1adc9fecd919538acfc62ceea95ae89385 Mon Sep 17 00:00:00 2001
From: Chris Mytton <chrismytton@gmail.com>
Date: Tue, 8 Nov 2016 14:27:15 +0000
Subject: [PATCH 02/20] WIP: Refactor scraper to use ScrapedPage classes

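The scraped_page gem wraps a page's URL and parsed Nokogiri document,
and its `field` DSL (backed by field_serializer) collects the declared
fields into the hash returned by #to_h. A minimal sketch of the
pattern, using an invented page class and selector:

    require 'scraped_page'

    class ExamplePage < ScrapedPage
      # Each `field` block becomes a key in #to_h; `noko` is the
      # parsed Nokogiri document for the page at `url`.
      field :title do
        noko.at_css('h1').text
      end
    end

    # ExamplePage.new(url: 'http://example.com/').to_h
    # # => { title: "Example Domain" }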
---
 lib/member_page.rb       | 73 ++++++++++++++++++++++++++++++++++++++++
 lib/members_list_page.rb | 11 ++++++
 scraper.rb               | 14 ++++++--
 3 files changed, 96 insertions(+), 2 deletions(-)
 create mode 100644 lib/member_page.rb
 create mode 100644 lib/members_list_page.rb

diff --git a/lib/member_page.rb b/lib/member_page.rb
new file mode 100644
index 000000000..a2e052716
--- /dev/null
+++ b/lib/member_page.rb
@@ -0,0 +1,73 @@
+require 'scraped_page'
+
+class String
+  def tidy
+    self.gsub(/[[:space:]]+/, ' ').strip
+  end
+end
+
+class MemberPage < ScrapedPage
+  # Remove session information from url
+  def url
+    super.to_s.match(/(.*)_piref[\d_]+\.(next_page.*)/).captures.join('')
+  end
+
+  field :iddiputado do
+    query['idDiputado']
+  end
+
+  field :term do
+    query['idLegislatura']
+  end
+
+  field :name do
+    noko.css('div#curriculum div.nombre_dip').text
+  end
+
+  field :family_names do
+    name.split(/,/).first.tidy
+  end
+
+  field :given_names do
+    name.split(/,/).last.tidy
+  end
+
+  field :gender do
+    return 'female' if seat.include? 'Diputada'
+    return 'male' if seat.include? 'Diputado'
+  end
+
+  field :source do
+    url.to_s
+  end
+
+  field :dob do
+    # TODO: Make a class to encapsulate parsing the DOB.
+  end
+
+  field :faction do
+    faction_information[:faction].tidy
+  end
+
+  field :faction_id do
+    faction_information[:faction_id].tidy
+  end
+
+  private
+
+  def seat
+    @seat ||= noko.at_css('div#curriculum div.texto_dip ul li div.dip_rojo:first').text.tidy
+  end
+
+  def group
+    @group ||= noko.at_css('div#curriculum div.texto_dip ul li div.dip_rojo:last').text.tidy
+  end
+
+  def query
+    @query ||= URI.decode_www_form(URI.parse(url).query).to_h
+  end
+
+  def faction_information
+    @faction_information ||= group.match(/(?<faction>.*?) \((?<faction_id>.*?)\)/)
+  end
+end
diff --git a/lib/members_list_page.rb b/lib/members_list_page.rb
new file mode 100644
index 000000000..29c26c53b
--- /dev/null
+++ b/lib/members_list_page.rb
@@ -0,0 +1,11 @@
+require 'scraped_page'
+
+class MembersListPage < ScrapedPage
+  def member_urls
+    @member_urls ||= noko.css('div#RESULTADOS_DIPUTADOS div.listado_1 ul li a').map { |p| p[:href] }
+  end
+
+  def next_page_url
+    @next_page_url ||= noko.css('//div[@class = "paginacion"]//a[contains("Página Siguiente")]').first[:href]
+  end
+end
diff --git a/scraper.rb b/scraper.rb
index f30fe3e47..c04e8513b 100644
--- a/scraper.rb
+++ b/scraper.rb
@@ -286,5 +286,15 @@ def scrape_person(term, url)
   ScraperWiki.save_sqlite([:id, :term], data)
 end
 
-scrape_people('http://www.congreso.es/portal/page/portal/Congreso/Congreso/Diputados/DiputadosTodasLegislaturas')
-scrape_memberships()
+# scrape_people('http://www.congreso.es/portal/page/portal/Congreso/Congreso/Diputados/DiputadosTodasLegislaturas')
+# scrape_memberships()
+
+require_relative 'lib/members_list_page'
+require_relative 'lib/member_page'
+
+start_url = 'http://www.congreso.es/portal/page/portal/Congreso/Congreso/Diputados/DiputadosTodasLegislaturas'
+
+MembersListPage.new(url: start_url).member_urls.each do |member_url|
+  member = MemberPage.new(url: URI.join(start_url, member_url))
+  ScraperWiki.save_sqlite([:name, :term], member.to_h)
+end

From 454a4a5fd6aef83b77393e9b8a5deb29508947d3 Mon Sep 17 00:00:00 2001
From: Chris Mytton <chrismytton@gmail.com>
Date: Mon, 14 Nov 2016 13:30:13 +0100
Subject: [PATCH 03/20] Add class to represent members' DOB

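The class converts the Spanish-language date strings on the member
pages into ISO 8601 dates. An illustrative use (the input string is
invented):

    dob = MemberPage::DateOfBirth.new('Nacido el 3 de mayo de 1964 en Madrid')
    dob.to_s # => "1964-05-03"

An unknown month name raises rather than silently producing a bad date.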
---
 lib/member_page.rb | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/lib/member_page.rb b/lib/member_page.rb
index a2e052716..f603e7239 100644
--- a/lib/member_page.rb
+++ b/lib/member_page.rb
@@ -7,6 +7,31 @@ def tidy
 end
 
 class MemberPage < ScrapedPage
+  class DateOfBirth
+    DATE_REGEX = /(?<day>\d+) de (?<month>[^[:space:]]*) de (?<year>\d+)/
+
+    def initialize(date_string)
+      @date_string = date_string
+    end
+
+    def to_s
+      return if match.nil?
+      "%d-%02d-%02d" % [ match[:year], month(match[:month]), match[:day] ]
+    end
+
+    private
+
+    attr_reader :date_string
+
+    def match
+      @match ||= date_string.match(DATE_REGEX)
+    end
+
+    def month(str)
+      ['','enero','febrero','marzo','abril','mayo','junio','julio','agosto','septiembre','octubre','noviembre','diciembre'].find_index(str.downcase) or raise "Unknown month #{str}".magenta
+    end
+  end
+
   # Remove session information from url
   def url
     super.to_s.match(/(.*)_piref[\d_]+\.(next_page.*)/).captures.join('')
@@ -42,7 +67,9 @@ def url
   end
 
   field :dob do
-    # TODO: Make a class to encapsulate parsing the DOB.
+    DateOfBirth.new(
+      noko.xpath('.//div[@class="titular_historico"]/following::div/ul/li').first.text
+    ).to_s
   end
 
   field :faction do

From 44001c8f90e8f4099e4c127dca50db49a039e231 Mon Sep 17 00:00:00 2001
From: Chris Mytton <chrismytton@gmail.com>
Date: Mon, 14 Nov 2016 13:30:30 +0100
Subject: [PATCH 04/20] Add party field

---
 lib/member_page.rb | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/lib/member_page.rb b/lib/member_page.rb
index f603e7239..cd5035dc7 100644
--- a/lib/member_page.rb
+++ b/lib/member_page.rb
@@ -62,6 +62,10 @@ def url
     return 'male' if seat.include? 'Diputado'
   end
 
+  field :party do
+    noko.at_css('#datos_diputado .nombre_grupo').text.tidy
+  end
+
   field :source do
     url.to_s
   end

From e08861de54d3f60ecceda09f97fb8688775f122c Mon Sep 17 00:00:00 2001
From: Chris Mytton <chrismytton@gmail.com>
Date: Mon, 14 Nov 2016 16:44:17 +0100
Subject: [PATCH 05/20] Add remaining methods to MemberPage class

These pick up the remaining fields that the existing scraper was collecting.
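
Most of them map a selector straight to a field. The constituency field
uses String#[] with a regexp and a capture-group index, e.g. with an
illustrative input:

    'Diputada por Madrid.'[/Diputad. por (.*)\./, 1] # => "Madrid"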
---
 lib/member_page.rb | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/lib/member_page.rb b/lib/member_page.rb
index cd5035dc7..efc437517 100644
--- a/lib/member_page.rb
+++ b/lib/member_page.rb
@@ -84,6 +84,50 @@ def url
     faction_information[:faction_id].tidy
   end
 
+  field :start_date do
+    start_date = noko.xpath('.//div[@class="dip_rojo"][contains(.,"Fecha alta")]')
+      .text.match(/(\d+)\/(\d+)\/(\d+)\./)
+    return if start_date.nil?
+    start_date.captures.reverse.join('-')
+  end
+
+  field :end_date do
+    end_date = noko.xpath('.//div[@class="dip_rojo"][contains(.,"Causó baja")]')
+      .text.match(/(\d+)\/(\d+)\/(\d+)\./)
+    return if end_date.nil?
+    end_date.captures.reverse.join('-')
+  end
+
+  field :email do
+    noko.css('.webperso_dip a[href*="mailto"]').text.tidy
+  end
+
+  field :twitter do
+    noko.css('.webperso_dip a[href*="twitter.com"]').text.tidy
+  end
+
+  field :facebook do
+    noko.css('.webperso_dip a[href*="facebook.com"]').text.tidy
+  end
+
+  field :phone do
+    noko.css('.texto_dip').map(&:text).join('').match(/Teléfono: (.*)$/).to_a.last.to_s.tidy
+  end
+
+  field :fax do
+    noko.css('.texto_dip').map(&:text).join('').match(/Fax: (.*)$/).to_a.last.to_s.tidy
+  end
+
+  field :constituency do
+    seat[/Diputad. por (.*)\./, 1]
+  end
+
+  field :photo do
+    foto = noko.at_css('#datos_diputado img[name="foto"]')
+    return if foto.nil?
+    URI.join(url, foto[:src]).to_s
+  end
+
   private
 
   def seat

From a36bbcc77cb4bb28fcbfb8fa5e90e7b5a3b2d4a5 Mon Sep 17 00:00:00 2001
From: Chris Mytton <chrismytton@gmail.com>
Date: Mon, 14 Nov 2016 17:31:12 +0100
Subject: [PATCH 06/20] Move DateOfBirth class into its own file

---
 lib/date_of_birth.rb | 25 +++++++++++++++++++++++++
 lib/member_page.rb   | 26 +-------------------------
 2 files changed, 26 insertions(+), 25 deletions(-)
 create mode 100644 lib/date_of_birth.rb

diff --git a/lib/date_of_birth.rb b/lib/date_of_birth.rb
new file mode 100644
index 000000000..9f922073f
--- /dev/null
+++ b/lib/date_of_birth.rb
@@ -0,0 +1,25 @@
+# frozen_string_literal: true
+class DateOfBirth
+  DATE_REGEX = /(?<day>\d+) de (?<month>[^[:space:]]*) de (?<year>\d+)/
+
+  def initialize(date_string)
+    @date_string = date_string
+  end
+
+  def to_s
+    return if match.nil?
+    "%d-%02d-%02d" % [ match[:year], month(match[:month]), match[:day] ]
+  end
+
+  private
+
+  attr_reader :date_string
+
+  def match
+    @match ||= date_string.match(DATE_REGEX)
+  end
+
+  def month(str)
+    ['', 'enero', 'febrero', 'marzo', 'abril', 'mayo', 'junio', 'julio', 'agosto', 'septiembre', 'octubre', 'noviembre', 'diciembre'].find_index(str.downcase) || raise("Unknown month #{str}".magenta)
+  end
+end
diff --git a/lib/member_page.rb b/lib/member_page.rb
index efc437517..b6860e7ae 100644
--- a/lib/member_page.rb
+++ b/lib/member_page.rb
@@ -1,4 +1,5 @@
 require 'scraped_page'
+require_relative 'date_of_birth'
 
 class String
   def tidy
@@ -7,31 +8,6 @@ def tidy
 end
 
 class MemberPage < ScrapedPage
-  class DateOfBirth
-    DATE_REGEX = /(?<day>\d+) de (?<month>[^[:space:]]*) de (?<year>\d+)/
-
-    def initialize(date_string)
-      @date_string = date_string
-    end
-
-    def to_s
-      return if match.nil?
-      "%d-%02d-%02d" % [ match[:year], month(match[:month]), match[:day] ]
-    end
-
-    private
-
-    attr_reader :date_string
-
-    def match
-      @match ||= date_string.match(DATE_REGEX)
-    end
-
-    def month(str)
-      ['','enero','febrero','marzo','abril','mayo','junio','julio','agosto','septiembre','octubre','noviembre','diciembre'].find_index(str.downcase) or raise "Unknown month #{str}".magenta
-    end
-  end
-
   # Remove session information from url
   def url
     super.to_s.match(/(.*)_piref[\d_]+\.(next_page.*)/).captures.join('')

From 7c1c583e47b6333e44892399d9c3bfa589e7dd21 Mon Sep 17 00:00:00 2001
From: Chris Mytton <chrismytton@gmail.com>
Date: Mon, 14 Nov 2016 17:33:57 +0100
Subject: [PATCH 07/20] Handle missing faction information

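When a member's group doesn't match the expected "Faction (ID)" format,
`group.match(...)` returns nil; falling back to an empty hash means the
field blocks get nil back, and the added `.to_s.tidy` turns that into
an empty string instead of raising NoMethodError. Roughly:

    ({}[:faction]).to_s.tidy # => ""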
---
 lib/member_page.rb | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lib/member_page.rb b/lib/member_page.rb
index b6860e7ae..924a7d0c8 100644
--- a/lib/member_page.rb
+++ b/lib/member_page.rb
@@ -53,11 +53,11 @@ def url
   end
 
   field :faction do
-    faction_information[:faction].tidy
+    faction_information[:faction].to_s.tidy
   end
 
   field :faction_id do
-    faction_information[:faction_id].tidy
+    faction_information[:faction_id].to_s.tidy
   end
 
   field :start_date do
@@ -119,6 +119,6 @@ def query
   end
 
   def faction_information
-    @faction_information ||= group.match(/(?<faction>.*?) \((?<faction_id>.*?)\)/)
+    @faction_information ||= group.match(/(?<faction>.*?) \((?<faction_id>.*?)\)/) || {}
   end
 end

From 14bea9d35b509fcc8121cd0a7f8f154e6595e092 Mon Sep 17 00:00:00 2001
From: Chris Mytton <chrismytton@gmail.com>
Date: Mon, 14 Nov 2016 17:35:53 +0100
Subject: [PATCH 08/20] More robust url session handling

Use the URI module to make it easier to just replace the unwanted part
of the query string.
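
For example (the session segment here is invented):

    require 'uri'

    uri = URI.parse('http://www.congreso.es/portal?_piref73_1333155_73.idDiputado=171&idLegislatura=12')
    uri.query = uri.query.gsub(/_piref[\d_]+\./, '')
    uri.to_s # => "http://www.congreso.es/portal?idDiputado=171&idLegislatura=12"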
---
 lib/member_page.rb | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/lib/member_page.rb b/lib/member_page.rb
index 924a7d0c8..a62689897 100644
--- a/lib/member_page.rb
+++ b/lib/member_page.rb
@@ -10,7 +10,10 @@ def tidy
 class MemberPage < ScrapedPage
   # Remove session information from url
   def url
-    super.to_s.match(/(.*)_piref[\d_]+\.(next_page.*)/).captures.join('')
+    uri = URI.parse(super.to_s)
+    return uri.to_s unless uri.query
+    uri.query = uri.query.gsub(/_piref[\d_]+\./, '')
+    uri.to_s
   end
 
   field :iddiputado do

From d48f099c9617ec4af6dc9763ce5406825859eea9 Mon Sep 17 00:00:00 2001
From: Chris Mytton <chrismytton@gmail.com>
Date: Mon, 14 Nov 2016 17:39:01 +0100
Subject: [PATCH 09/20] Extract a SpanishCongressPage base class

This means that, rather than having to override the URL in every class in
the system, we can do it once here and have the other pages inherit from
this class.
---
 lib/member_page.rb           | 12 ++----------
 lib/spanish_congress_page.rb | 12 ++++++++++++
 2 files changed, 14 insertions(+), 10 deletions(-)
 create mode 100644 lib/spanish_congress_page.rb

diff --git a/lib/member_page.rb b/lib/member_page.rb
index a62689897..2430eb08e 100644
--- a/lib/member_page.rb
+++ b/lib/member_page.rb
@@ -1,4 +1,4 @@
-require 'scraped_page'
+require_relative 'spanish_congress_page'
 require_relative 'date_of_birth'
 
 class String
@@ -7,15 +7,7 @@ def tidy
   end
 end
 
-class MemberPage < ScrapedPage
-  # Remove session information from url
-  def url
-    uri = URI.parse(super.to_s)
-    return uri.to_s unless uri.query
-    uri.query = uri.query.gsub(/_piref[\d_]+\./, '')
-    uri.to_s
-  end
-
+class MemberPage < SpanishCongressPage
   field :iddiputado do
     query['idDiputado']
   end
diff --git a/lib/spanish_congress_page.rb b/lib/spanish_congress_page.rb
new file mode 100644
index 000000000..a71aa069d
--- /dev/null
+++ b/lib/spanish_congress_page.rb
@@ -0,0 +1,12 @@
+require 'scraped_page'
+require 'uri'
+
+class SpanishCongressPage < ScrapedPage
+  # Remove session information from url
+  def url
+    uri = URI.parse(super.to_s)
+    return uri.to_s unless uri.query
+    uri.query = uri.query.gsub(/_piref[\d_]+\./, '')
+    uri.to_s
+  end
+end

From 5485aae9496475c4e383fb8194a609aeba082925 Mon Sep 17 00:00:00 2001
From: Chris Mytton <chrismytton@gmail.com>
Date: Mon, 14 Nov 2016 17:40:21 +0100
Subject: [PATCH 10/20] Move String#tidy into its own file

This should become part of ScrapedPage soon.
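
For reference, `tidy` collapses any run of whitespace into a single
space and strips both ends:

    require_relative 'lib/core_ext'

    "  Pérez \n  García  ".tidy # => "Pérez García"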
---
 lib/core_ext.rb    | 6 ++++++
 lib/member_page.rb | 7 +------
 2 files changed, 7 insertions(+), 6 deletions(-)
 create mode 100644 lib/core_ext.rb

diff --git a/lib/core_ext.rb b/lib/core_ext.rb
new file mode 100644
index 000000000..d701f70d4
--- /dev/null
+++ b/lib/core_ext.rb
@@ -0,0 +1,6 @@
+# frozen_string_literal: true
+class String
+  def tidy
+    gsub(/[[:space:]]+/, ' ').strip
+  end
+end
diff --git a/lib/member_page.rb b/lib/member_page.rb
index 2430eb08e..b4b3a88ef 100644
--- a/lib/member_page.rb
+++ b/lib/member_page.rb
@@ -1,11 +1,6 @@
 require_relative 'spanish_congress_page'
 require_relative 'date_of_birth'
-
-class String
-  def tidy
-    self.gsub(/[[:space:]]+/, ' ').strip
-  end
-end
+require_relative 'core_ext'
 
 class MemberPage < SpanishCongressPage
   field :iddiputado do

From b3b1096246bc7923c5520bcb976be28eda914966 Mon Sep 17 00:00:00 2001
From: Chris Mytton <chrismytton@gmail.com>
Date: Mon, 14 Nov 2016 17:41:12 +0100
Subject: [PATCH 11/20] Make MembersListPage inherit from SpanishCongressPage

This means these pages also have the session information stripped from
their URLs.
---
 lib/members_list_page.rb | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/members_list_page.rb b/lib/members_list_page.rb
index 29c26c53b..c68ce99c0 100644
--- a/lib/members_list_page.rb
+++ b/lib/members_list_page.rb
@@ -1,6 +1,6 @@
-require 'scraped_page'
+require_relative 'spanish_congress_page'
 
-class MembersListPage < ScrapedPage
+class MembersListPage < SpanishCongressPage
   def member_urls
     @member_urls ||= noko.css('div#RESULTADOS_DIPUTADOS div.listado_1 ul li a').map { |p| p[:href] }
   end

From c7163ee817df920c7567b5fa2e6ffe4bb322253d Mon Sep 17 00:00:00 2001
From: Chris Mytton <chrismytton@gmail.com>
Date: Mon, 14 Nov 2016 17:41:40 +0100
Subject: [PATCH 12/20] Handle pagination in scraper.rb

Rather than just scraping the first page, scrape all pages.
---
 scraper.rb | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/scraper.rb b/scraper.rb
index c04e8513b..eb9933ea8 100644
--- a/scraper.rb
+++ b/scraper.rb
@@ -292,9 +292,14 @@ def scrape_person(term, url)
 require_relative 'lib/members_list_page'
 require_relative 'lib/member_page'
 
-start_url = 'http://www.congreso.es/portal/page/portal/Congreso/Congreso/Diputados/DiputadosTodasLegislaturas'
+url = 'http://www.congreso.es/portal/page/portal/Congreso/Congreso/Diputados/DiputadosTodasLegislaturas'
 
-MembersListPage.new(url: start_url).member_urls.each do |member_url|
-  member = MemberPage.new(url: URI.join(start_url, member_url))
-  ScraperWiki.save_sqlite([:name, :term], member.to_h)
+loop do
+  page = MembersListPage.new(url: url)
+  page.member_urls.each do |member_url|
+    member = MemberPage.new(url: URI.join(url, member_url))
+    ScraperWiki.save_sqlite([:name, :term], member.to_h)
+  end
+  url = page.next_page_url
+  break if url.nil?
 end

From 0e158b982524fcfe4be6e9a2034a5a90e1e6fa1c Mon Sep 17 00:00:00 2001
From: Chris Mytton <chrismytton@gmail.com>
Date: Mon, 14 Nov 2016 20:18:07 +0100
Subject: [PATCH 13/20] Convert name parts to string before tidying

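If a name is ever empty, `split` returns an empty array whose
`first`/`last` are nil, so calling `.tidy` directly would raise
NoMethodError:

    ''.split(/,/).first      # => nil
    ''.split(/,/).first.to_s # => ""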
---
 lib/member_page.rb | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/member_page.rb b/lib/member_page.rb
index b4b3a88ef..9b0e16300 100644
--- a/lib/member_page.rb
+++ b/lib/member_page.rb
@@ -16,11 +16,11 @@ class MemberPage < SpanishCongressPage
   end
 
   field :family_names do
-    name.split(/,/).first.tidy
+    name.split(/,/).first.to_s.tidy
   end
 
   field :given_names do
-    name.split(/,/).last.tidy
+    name.split(/,/).last.to_s.tidy
   end
 
   field :gender do

From cddfd484e4be37de5e2454fd6dc2591e97d2d161 Mon Sep 17 00:00:00 2001
From: Chris Mytton <chrismytton@gmail.com>
Date: Mon, 14 Nov 2016 20:30:43 +0100
Subject: [PATCH 14/20] Correctly handle the last page of results

On the last page there is no next link, so calling `[:href]` on the
missing element blows up. Instead I've moved the link lookup into a
separate `next_page_link` method and made `next_page_url` check that the
link exists first.
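
Because `&&` short-circuits, the nil case never reaches `[:href]`:

    next_page_link && next_page_link[:href] # => nil on the last page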
---
 lib/members_list_page.rb | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/lib/members_list_page.rb b/lib/members_list_page.rb
index c68ce99c0..9f68a3c8f 100644
--- a/lib/members_list_page.rb
+++ b/lib/members_list_page.rb
@@ -6,6 +6,10 @@ def member_urls
   end
 
   def next_page_url
-    @next_page_url ||= noko.css('//div[@class = "paginacion"]//a[contains("Página Siguiente")]').first[:href]
+    next_page_link && next_page_link[:href]
+  end
+
+  def next_page_link
+    @next_page_url ||= noko.css('//div[@class = "paginacion"]//a[contains("Página Siguiente")]').first
   end
 end

From 80b4a12e4f2adfefded7f06d551cbe748c07ce5f Mon Sep 17 00:00:00 2001
From: Chris Mytton <chrismytton@gmail.com>
Date: Mon, 14 Nov 2016 20:44:46 +0100
Subject: [PATCH 15/20] Simplify phone and fax fields

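Nokogiri's NodeSet#text already concatenates the text of every matched
node, so the map/join round trip was redundant:

    noko.css('.texto_dip').map(&:text).join('') == noko.css('.texto_dip').text
    # => true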
---
 lib/member_page.rb | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/member_page.rb b/lib/member_page.rb
index 9b0e16300..a657dd7e4 100644
--- a/lib/member_page.rb
+++ b/lib/member_page.rb
@@ -77,11 +77,11 @@ class MemberPage < SpanishCongressPage
   end
 
   field :phone do
-    noko.css('.texto_dip').map(&:text).join('').match(/Teléfono: (.*)$/).to_a.last.to_s.tidy
+    noko.css('.texto_dip').text.match(/Teléfono: (.*)$/).to_a.last.to_s.tidy
   end
 
   field :fax do
-    noko.css('.texto_dip').map(&:text).join('').match(/Fax: (.*)$/).to_a.last.to_s.tidy
+    noko.css('.texto_dip').text.match(/Fax: (.*)$/).to_a.last.to_s.tidy
   end
 
   field :constituency do

From c25c4b0ebbcb2396b12ba3f4fd281e3aff42c138 Mon Sep 17 00:00:00 2001
From: Chris Mytton <chrismytton@gmail.com>
Date: Mon, 14 Nov 2016 20:45:22 +0100
Subject: [PATCH 16/20] DateOfBirth#to_s should always return a string

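This saves callers from needing a nil guard; an unparseable date now
serialises as an empty string:

    DateOfBirth.new('no date here').to_s # => ""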
---
 lib/date_of_birth.rb | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/date_of_birth.rb b/lib/date_of_birth.rb
index 9f922073f..0e1adcd01 100644
--- a/lib/date_of_birth.rb
+++ b/lib/date_of_birth.rb
@@ -7,8 +7,8 @@ def initialize(date_string)
   end
 
   def to_s
-    return if match.nil?
-    "%d-%02d-%02d" % [ match[:year], month(match[:month]), match[:day] ]
+    return '' if match.nil?
+    '%d-%02d-%02d' % [match[:year], month(match[:month]), match[:day]]
   end
 
   private

From 4fb737a6d43d535fa7349d5c9a83c4b0164727cd Mon Sep 17 00:00:00 2001
From: Chris Mytton <chrismytton@gmail.com>
Date: Mon, 14 Nov 2016 21:02:47 +0100
Subject: [PATCH 17/20] Run rubocop -a on lib

---
 lib/member_page.rb           | 5 +++--
 lib/members_list_page.rb     | 1 +
 lib/spanish_congress_page.rb | 1 +
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/lib/member_page.rb b/lib/member_page.rb
index a657dd7e4..278f9f0a2 100644
--- a/lib/member_page.rb
+++ b/lib/member_page.rb
@@ -1,3 +1,4 @@
+# frozen_string_literal: true
 require_relative 'spanish_congress_page'
 require_relative 'date_of_birth'
 require_relative 'core_ext'
@@ -52,14 +53,14 @@ class MemberPage < SpanishCongressPage
 
   field :start_date do
     start_date = noko.xpath('.//div[@class="dip_rojo"][contains(.,"Fecha alta")]')
-      .text.match(/(\d+)\/(\d+)\/(\d+)\./)
+                     .text.match(/(\d+)\/(\d+)\/(\d+)\./)
     return if start_date.nil?
     start_date.captures.reverse.join('-')
   end
 
   field :end_date do
     end_date = noko.xpath('.//div[@class="dip_rojo"][contains(.,"Causó baja")]')
-      .text.match(/(\d+)\/(\d+)\/(\d+)\./)
+                   .text.match(/(\d+)\/(\d+)\/(\d+)\./)
     return if end_date.nil?
     end_date.captures.reverse.join('-')
   end
diff --git a/lib/members_list_page.rb b/lib/members_list_page.rb
index 9f68a3c8f..5597d4bde 100644
--- a/lib/members_list_page.rb
+++ b/lib/members_list_page.rb
@@ -1,3 +1,4 @@
+# frozen_string_literal: true
 require_relative 'spanish_congress_page'
 
 class MembersListPage < SpanishCongressPage
diff --git a/lib/spanish_congress_page.rb b/lib/spanish_congress_page.rb
index a71aa069d..d9965fb52 100644
--- a/lib/spanish_congress_page.rb
+++ b/lib/spanish_congress_page.rb
@@ -1,3 +1,4 @@
+# frozen_string_literal: true
 require 'scraped_page'
 require 'uri'
 

From b1c18846f2d2b22aa66e9552107007ae432a852a Mon Sep 17 00:00:00 2001
From: Chris Mytton <chrismytton@gmail.com>
Date: Mon, 14 Nov 2016 21:05:32 +0100
Subject: [PATCH 18/20] Delete unused code in scraper.rb

---
 scraper.rb | 292 +----------------------------------------------------
 1 file changed, 2 insertions(+), 290 deletions(-)

diff --git a/scraper.rb b/scraper.rb
index eb9933ea8..fe1406276 100644
--- a/scraper.rb
+++ b/scraper.rb
@@ -1,294 +1,6 @@
-#!/bin/env ruby
-# encoding: utf-8
-
+# frozen_string_literal: true
 require 'scraperwiki'
-require 'capybara'
-require 'capybara/dsl'
-require 'capybara/poltergeist'
-require 'pry'
-require 'scraped_page_archive/capybara'
-
-Capybara.default_max_wait_time = 5
-
-
-# images are very slow to load and cause timeouts and
-# as we don't need them skip
-# Also, some pages have JS errors which we don't care about
-options = {
-    js_errors: false,
-    timeout: 60,
-    phantomjs_options: ['--load-images=no']
-}
-
-Capybara.register_driver :poltergeist do |app|
-  Capybara::Poltergeist::Driver.new(app, options)
-end
-
-include Capybara::DSL
-Capybara.default_driver = :poltergeist
-
-
-class String
-  def tidy
-    self.gsub(/[[:space:]]+/, ' ').strip
-  end
-end
-
-def month(str)
-  ['','enero','febrero','marzo','abril','mayo','junio','julio','agosto','septiembre','octubre','noviembre','diciembre'].find_index(str) or raise "Unknown month #{str}".magenta
-end
-
-def date_of_birth(str)
-  matched = str.match(/(\d+) de ([^[:space:]]*) de (\d+)/) or return
-  day, month, year = matched.captures
-  "%d-%02d-%02d" % [ year, month(month), day ]
-end
-
-def gender_from(seat)
-  return 'female' if seat.include? 'Diputada'
-  return 'male' if seat.include? 'Diputado'
-  return
-end
-
-def save_membership_from_url(name, url)
-  iddiputado = url.to_s.match(/idDiputado=(\d+)/).captures[0]
-  term = url.to_s.match(/idLegislatura=(\d+)/).captures[0]
-  # strip out session id
-  url = url.match(/(.*)_piref[\d_]+\.(next_page.*)/).captures.join('')
-
-  # we can set this to rescrape everything if required
-  unless ENV.key?('MORPH_RESCRAPE_ALL')
-    # don't save data again
-    cur_name = ScraperWiki::select('name FROM memberships WHERE iddiputado is ? AND term is ?', [iddiputado, term]) rescue nil
-    unless cur_name.nil? or cur_name.empty?
-      return
-    end
-  end
-
-  person = {
-    id: 0,
-    name: name.tidy,
-    term: term,
-    iddiputado: iddiputado,
-    url: url
-  }
-
-  ScraperWiki.save_sqlite([:term, :iddiputado], person, 'memberships')
-end
-
-# use the first term they were elected in and the id from that term as the unique id
-# although for people with only one term the page in question seems to fall over so
-# fall back to the current term and id for those people as it's presumably their first
-def get_unique_id(url, page_term, page_iddiputado, name)
-  cur_id = ScraperWiki::select('id FROM memberships WHERE iddiputado is ? AND term is ? and id <> 0', [page_iddiputado, page_term]) rescue nil
-  unless cur_id.nil? or cur_id.empty?
-    return cur_id[0][:id]
-  end
-  sleep(1)
-
-  visit url
-
-  term_map = {}
-  all('div.all_leg').each do |legislature|
-    within(legislature) do
-      term = nil
-      if legislature.has_css?('div.btn_ficha a')
-        link = find('div.btn_ficha a')
-        href = link['href']
-        # we can't do this as one operation as they don't always appear
-        # in the same order :(
-        term = href.to_s.match(/idLegislatura=(\d+)/).captures[0]
-        id = href.to_s.match(/idDiputado=(\d+)/).captures[0]
-        term_map[term.to_i] = id
-        save_membership_from_url(name, href)
-      end
-      if not term.nil? and legislature.has_css?('div.principal')
-        term_div = find('div.principal')
-        name, start_year, end_year = term_div.text.match(/(\w+\s*\w+)\s*\(\s*(\d+)\s*-\s*([^)]*)\)/).captures
-        if end_year.tidy == 'Actualidad'
-          end_year = ''
-        end
-        exists = ScraperWiki::select('id FROM terms WHERE id is ??', [id]) rescue nil
-        if exists.nil?
-          term = {
-            id: term,
-            name: name.tidy,
-            start_date: start_year.tidy,
-            end_date: end_year.tidy,
-            source: 'http://www.congreso.es/',
-          }
-          ScraperWiki.save_sqlite([:id], term, 'terms')
-        end
-      end
-    end
-  end
-
-  # the all terms page seems to be very unreliable so if we can't find what we expect
-  # then we should quite rather than trying to make up an incorrect ID
-  if term_map.empty?
-      return nil
-  end
-
-  min_term = term_map.keys.min
-
-  id = "#{min_term}_#{term_map[min_term]}"
-  for term in term_map.keys
-      ScraperWiki.sqliteexecute('update memberships set id = ? where id = 0 and term = ? and iddiputado = ?', [id, term, term_map[term]])
-  end
-  return id
-end
-
-def scrape_people(url)
-  visit url
-
-  all('div#RESULTADOS_DIPUTADOS div.listado_1 ul li a').each do |link|
-    save_membership_from_url(link.text, link['href'])
-  end
-
-  pagination = all('div.paginacion').first
-  next_page = nil
-  if pagination.has_xpath?(".//a[contains(.,'Página Siguiente')]")
-    within (pagination) do
-      next_page = find(:xpath, ".//a[contains(.,'Página Siguiente')]")
-    end
-  end
-
-  # the website is a bit fragile to lets not hammer it with requests
-  sleep(2)
-  unless next_page.nil?
-    scrape_people(next_page['href'])
-  end
-end
-
-def scrape_memberships()
-  memberships = ScraperWiki::select('* FROM memberships')
-  for membership in memberships
-    scrape_person(membership['term'], membership['url'])
-  end
-end
-
-def scrape_person(term, url)
-  iddiputado = url.to_s[/idDiputado=(\d+)/, 1]
-
-  unless ENV.key?('MORPH_RESCRAPE_ALL') or (ENV.key?('MORPH_RESCRAPE_TERM') and ENV['MORPH_RESCRAPE_TERM'] == term)
-    # don't scrape data we already have
-    name = ScraperWiki::select('name FROM data WHERE iddiputado is ? AND term is ?', [iddiputado, term]) rescue nil
-    unless name.nil? or name.empty?
-      #name = name[0]['name']
-      #puts "skipping #{name} for #{term}"
-      return
-    end
-  end
-  sleep(1)
-
-  # only visit URL if we are collecting the data
-  visit url
-
-  seat, group = all('div#curriculum div.texto_dip ul li div.dip_rojo').map(&:text).map(&:tidy)
-  faction, faction_id = group.match(/(.*?) \((.*?)\)/).captures.to_a.map(&:tidy) rescue nil
-
-  # sometimes the scraper doesn't find the name on the page and rather than stop scraping
-  # everything else just move on to the next person
-  begin
-    name = find('div#curriculum div.nombre_dip').text
-  rescue
-      $stderr.puts "failed to find name element for #{url}"
-    return
-  end
-
-  family_names, given_names = name.split(/,/).map(&:tidy)
-
-  if page.has_xpath?('.//div[@class="dip_rojo"][contains(.,"Fecha alta")]')
-    fecha_alta = find(:xpath, './/div[@class="dip_rojo"][contains(.,"Fecha alta")]')
-    start_date = fecha_alta.text.match(/(\d+)\/(\d+)\/(\d+)\./).captures.reverse.join("-")
-  end
-
-  if page.has_xpath?('.//div[@class="dip_rojo"][contains(.,"Causó baja")]')
-    causo_baja = find(:xpath, './/div[@class="dip_rojo"][contains(.,"Causó baja")]')
-    end_date = causo_baja.text.match(/(\d+)\/(\d+)\/(\d+)\./).captures.reverse.join("-")
-  end
-
-  dob = ''
-  email = ''
-  twitter = ''
-  facebook = ''
-  photo = ''
-  within('div.titular_historico') do
-    dob = date_of_birth(all(:xpath, 'following::div/ul/li')[0].text)
-  end
-
-  # capybara doesn't support enough xpath to do this
-  # sensibly so we have to do this the longwinded way
-  if page.has_xpath?('//div[@class="webperso_dip"]/div[@class="webperso_dip_parte"|@class="webperso_dip_imagen"]/a')
-    all(:xpath, '//div[@class="webperso_dip"]/div[@class="webperso_dip_parte"|@class="webperso_dip_imagen"]/a').each do |link|
-      href = link['href']
-      if href.match(/mailto/)
-        email = link.text.tidy
-      end
-      if href.match(/twitter.com/)
-        twitter = href.match(/twitter.com\/(.*)$/).captures[0]
-      end
-      if href.match(/facebook.com/)
-        facebook = href
-      end
-    end
-  end
-
-  all('div#datos_diputado').each do |img|
-    within(img) do
-      if img.has_xpath?('.//p[@class="logo_group"]/img[@name="foto"]')
-        photo = find(:xpath, './/p[@class="logo_group"]/img[@name="foto"]')['src'].text
-      end
-    end
-  end
-
-  data = {
-    iddiputado: iddiputado,
-    name: "#{given_names} #{family_names}",
-    sort_name: name,
-    given_name: given_names,
-    family_name: family_names,
-    gender: gender_from(seat),
-    party: find('div#datos_diputado p.nombre_grupo').text.tidy,
-    faction_id: faction_id,
-    faction: faction,
-    source: url.to_s,
-    dob: dob,
-    term: term,
-    start_date: start_date,
-    end_date: end_date,
-    email: email,
-    twitter: twitter,
-    facebook: facebook,
-    phone: all('div.texto_dip').map(&:text).join('').match(/Teléfono: (.*)$/).to_a.last.to_s.tidy,
-    fax: all('div.texto_dip').map(&:text).join('').match(/Fax: (.*)$/).to_a.last.to_s.tidy,
-    constituency: seat[/Diputad. por (.*)\./, 1],
-    photo: photo,
-  }
-  data[:photo] = URI.join(url, data[:photo]).to_s unless data[:photo].to_s.empty?
-
-  all_terms_url = find('div.soporte_year li a')['href'].match('.*listadoFichas.*').to_a.first.to_s
-
-  # it might seem a bit odd to do this only once we've worked out everything
-  # else but doing it this way means we don't need to visit the all terms page
-  # and then go back so it's one less network call per person
-  id = get_unique_id(all_terms_url, term, iddiputado, name)
-
-  # don't save things if we don't get an id
-  if id.nil?
-    #puts "no id so not saving"
-      return
-  end
-
-  data[:id] = id
-
-  #puts "%s - %s\n" % [ data[:name], data[:id] ]
-  ScraperWiki.save_sqlite([:id, :term], data)
-end
-
-# scrape_people('http://www.congreso.es/portal/page/portal/Congreso/Congreso/Diputados/DiputadosTodasLegislaturas')
-# scrape_memberships()
-
+require 'uri'
 require_relative 'lib/members_list_page'
 require_relative 'lib/member_page'
 

From 4b7fe80d42dcb8c8bb53a4f7290dc27e20280af0 Mon Sep 17 00:00:00 2001
From: Chris Mytton <chrismytton@gmail.com>
Date: Mon, 14 Nov 2016 21:05:50 +0100
Subject: [PATCH 19/20] Remove unneeded gems

Most gems we need now come as dependencies of scraped_page.
---
 Gemfile      |  6 ------
 Gemfile.lock | 45 +++------------------------------------------
 2 files changed, 3 insertions(+), 48 deletions(-)

diff --git a/Gemfile b/Gemfile
index c81e1866f..09297b106 100644
--- a/Gemfile
+++ b/Gemfile
@@ -8,11 +8,5 @@ git_source(:github) { |repo_name| "https://github.com/#{repo_name}.git" }
 ruby "2.3.1"
 
 gem "scraperwiki", github: "openaustralia/scraperwiki-ruby", branch: "morph_defaults"
-gem "nokogiri"
-gem "open-uri-cached"
 gem "pry"
-gem "colorize"
-gem "capybara"
-gem "poltergeist"
-gem 'scraped_page_archive', github: "everypolitician/scraped_page_archive", branch: "master"
 gem 'scraped_page', github: 'everypolitician/scraped_page', branch: 'master'
diff --git a/Gemfile.lock b/Gemfile.lock
index 5a267447d..045b1e02e 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -8,15 +8,6 @@ GIT
       nokogiri
       scraped_page_archive (>= 0.5)
 
-GIT
-  remote: https://github.com/everypolitician/scraped_page_archive.git
-  revision: c4056f6029f284a326e56c129766e397995e55f7
-  branch: master
-  specs:
-    scraped_page_archive (0.5.0)
-      git (~> 1.3.0)
-      vcr-archive (~> 0.3.0)
-
 GIT
   remote: https://github.com/openaustralia/scraperwiki-ruby.git
   revision: fc50176812505e463077d5c673d504a6a234aa78
@@ -31,16 +22,7 @@ GEM
   specs:
     addressable (2.5.0)
       public_suffix (~> 2.0, >= 2.0.2)
-    capybara (2.6.2)
-      addressable
-      mime-types (>= 1.16)
-      nokogiri (>= 1.3.3)
-      rack (>= 1.0.0)
-      rack-test (>= 0.5.4)
-      xpath (~> 2.0)
-    cliver (0.3.2)
     coderay (1.1.1)
-    colorize (0.7.7)
     crack (0.4.3)
       safe_yaml (~> 1.0.0)
     field_serializer (0.2.0)
@@ -48,28 +30,18 @@ GEM
     hashdiff (0.3.0)
     httpclient (2.7.1)
     method_source (0.8.2)
-    mime-types (3.0)
-      mime-types-data (~> 3.2015)
-    mime-types-data (3.2016.0221)
     mini_portile2 (2.1.0)
-    multi_json (1.11.2)
     nokogiri (1.6.8.1)
       mini_portile2 (~> 2.1.0)
-    open-uri-cached (0.0.5)
-    poltergeist (1.9.0)
-      capybara (~> 2.1)
-      cliver (~> 0.3.1)
-      multi_json (~> 1.0)
-      websocket-driver (>= 0.2.0)
     pry (0.10.3)
       coderay (~> 1.1.0)
       method_source (~> 0.8.1)
       slop (~> 3.4)
     public_suffix (2.0.4)
-    rack (1.6.4)
-    rack-test (0.6.3)
-      rack (>= 1.0)
     safe_yaml (1.0.4)
+    scraped_page_archive (0.5.0)
+      git (~> 1.3.0)
+      vcr-archive (~> 0.3.0)
     slop (3.6.0)
     sqlite3 (1.3.11)
     sqlite_magic (0.0.6)
@@ -82,24 +54,13 @@ GEM
       addressable (>= 2.3.6)
       crack (>= 0.3.2)
       hashdiff
-    websocket-driver (0.6.3)
-      websocket-extensions (>= 0.1.0)
-    websocket-extensions (0.1.2)
-    xpath (2.0.0)
-      nokogiri (~> 1.3)
 
 PLATFORMS
   ruby
 
 DEPENDENCIES
-  capybara
-  colorize
-  nokogiri
-  open-uri-cached
-  poltergeist
   pry
   scraped_page!
-  scraped_page_archive!
   scraperwiki!
 
 RUBY VERSION

From 95cad54f9caed557063bf02961ae720a49485af2 Mon Sep 17 00:00:00 2001
From: Chris Mytton <chrismytton@gmail.com>
Date: Mon, 20 Feb 2017 10:51:01 +0100
Subject: [PATCH 20/20] Add open-uri-cached

This caches a copy of fetched webpages to make the scraper quicker to
develop locally.
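
A sketch of the effect, assuming the gem's standard monkeypatching of
OpenURI:

    require 'open-uri/cached'
    OpenURI::Cache.cache_path = '.cache'

    open('http://example.com/').read # first call fetches over the network
    open('http://example.com/').read # repeats are served from .cache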
---
 .gitignore   | 1 +
 Gemfile      | 1 +
 Gemfile.lock | 4 +++-
 scraper.rb   | 3 +++
 4 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 66d464d5a..2c7b194ce 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 # Ignore output of scraper
 data.sqlite
+.cache
diff --git a/Gemfile b/Gemfile
index 09297b106..71dbac8b5 100644
--- a/Gemfile
+++ b/Gemfile
@@ -10,3 +10,4 @@ ruby "2.3.1"
 gem "scraperwiki", github: "openaustralia/scraperwiki-ruby", branch: "morph_defaults"
 gem "pry"
 gem 'scraped_page', github: 'everypolitician/scraped_page', branch: 'master'
+gem 'open-uri-cached'
diff --git a/Gemfile.lock b/Gemfile.lock
index 045b1e02e..4e948b80d 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -33,6 +33,7 @@ GEM
     mini_portile2 (2.1.0)
     nokogiri (1.6.8.1)
       mini_portile2 (~> 2.1.0)
+    open-uri-cached (0.0.5)
     pry (0.10.3)
       coderay (~> 1.1.0)
       method_source (~> 0.8.1)
@@ -59,6 +60,7 @@ PLATFORMS
   ruby
 
 DEPENDENCIES
+  open-uri-cached
   pry
   scraped_page!
   scraperwiki!
@@ -67,4 +69,4 @@ RUBY VERSION
    ruby 2.3.1p112
 
 BUNDLED WITH
-   1.13.6
+   1.13.7
diff --git a/scraper.rb b/scraper.rb
index fe1406276..bf83c4b1a 100644
--- a/scraper.rb
+++ b/scraper.rb
@@ -4,6 +4,9 @@
 require_relative 'lib/members_list_page'
 require_relative 'lib/member_page'
 
+require 'open-uri/cached'
+OpenURI::Cache.cache_path = '.cache'
+
 url = 'http://www.congreso.es/portal/page/portal/Congreso/Congreso/Diputados/DiputadosTodasLegislaturas'
 
 loop do