From 42c6c3e9d73b89561115d62cdc7b1c834dd5ddb6 Mon Sep 17 00:00:00 2001
From: Chris Mytton <chrismytton@gmail.com>
Date: Tue, 8 Nov 2016 12:17:44 +0000
Subject: [PATCH 01/20] Add scraped_page dependency

---
 Gemfile      |  1 +
 Gemfile.lock | 28 +++++++++++++++++++++-------
 2 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/Gemfile b/Gemfile
index ff39898ce..c81e1866f 100644
--- a/Gemfile
+++ b/Gemfile
@@ -15,3 +15,4 @@ gem "colorize"
 gem "capybara"
 gem "poltergeist"
 gem 'scraped_page_archive', github: "everypolitician/scraped_page_archive", branch: "master"
+gem 'scraped_page', github: 'everypolitician/scraped_page', branch: 'master'
diff --git a/Gemfile.lock b/Gemfile.lock
index 6546341dc..5a267447d 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -1,9 +1,19 @@
+GIT
+  remote: https://github.com/everypolitician/scraped_page.git
+  revision: 6eb806c01e8b9f3e333575a639e40a68be60ae48
+  branch: master
+  specs:
+    scraped_page (0.1.0)
+      field_serializer
+      nokogiri
+      scraped_page_archive (>= 0.5)
+
 GIT
   remote: https://github.com/everypolitician/scraped_page_archive.git
-  revision: 9d8a6347d122d42a983ba28ff84d24de6bdca8a0
+  revision: c4056f6029f284a326e56c129766e397995e55f7
   branch: master
   specs:
-    scraped_page_archive (0.4.1)
+    scraped_page_archive (0.5.0)
       git (~> 1.3.0)
       vcr-archive (~> 0.3.0)

@@ -19,7 +29,8 @@ GIT
 GEM
   remote: https://rubygems.org/
   specs:
-    addressable (2.4.0)
+    addressable (2.5.0)
+      public_suffix (~> 2.0, >= 2.0.2)
     capybara (2.6.2)
       addressable
       mime-types (>= 1.16)
@@ -32,6 +43,7 @@ GEM
     colorize (0.7.7)
     crack (0.4.3)
       safe_yaml (~> 1.0.0)
+    field_serializer (0.2.0)
     git (1.3.0)
     hashdiff (0.3.0)
     httpclient (2.7.1)
@@ -39,10 +51,10 @@ GEM
     mime-types (3.0)
       mime-types-data (~> 3.2015)
     mime-types-data (3.2016.0221)
-    mini_portile2 (2.0.0)
+    mini_portile2 (2.1.0)
     multi_json (1.11.2)
-    nokogiri (1.6.7.2)
-      mini_portile2 (~> 2.0.0.rc2)
+    nokogiri (1.6.8.1)
+      mini_portile2 (~> 2.1.0)
     open-uri-cached (0.0.5)
     poltergeist (1.9.0)
       capybara (~> 2.1)
@@ -53,6 +65,7 @@ GEM
       coderay (~> 1.1.0)
       method_source (~> 0.8.1)
       slop (~> 3.4)
+    public_suffix (2.0.4)
     rack (1.6.4)
     rack-test (0.6.3)
       rack (>= 1.0)
@@ -85,6 +98,7 @@ DEPENDENCIES
   open-uri-cached
   poltergeist
   pry
+  scraped_page!
   scraped_page_archive!
   scraperwiki!

@@ -92,4 +106,4 @@ RUBY VERSION
    ruby 2.3.1p112

 BUNDLED WITH
-   1.13.5
+   1.13.6

From a2da5d1adc9fecd919538acfc62ceea95ae89385 Mon Sep 17 00:00:00 2001
From: Chris Mytton <chrismytton@gmail.com>
Date: Tue, 8 Nov 2016 14:27:15 +0000
Subject: [PATCH 02/20] WIP: Refactor scraper to use ScrapedPage classes

---
 lib/member_page.rb       | 73 ++++++++++++++++++++++++++++++++++++++++
 lib/members_list_page.rb | 11 ++++++
 scraper.rb               | 14 ++++++--
 3 files changed, 96 insertions(+), 2 deletions(-)
 create mode 100644 lib/member_page.rb
 create mode 100644 lib/members_list_page.rb

diff --git a/lib/member_page.rb b/lib/member_page.rb
new file mode 100644
index 000000000..a2e052716
--- /dev/null
+++ b/lib/member_page.rb
@@ -0,0 +1,73 @@
+require 'scraped_page'
+
+class String
+  def tidy
+    self.gsub(/[[:space:]]+/, ' ').strip
+  end
+end
+
+class MemberPage < ScrapedPage
+  # Remove session information from url
+  def url
+    super.to_s.match(/(.*)_piref[\d_]+\.(next_page.*)/).captures.join('')
+  end
+
+  field :iddiputado do
+    query['idDiputado']
+  end
+
+  field :term do
+    query['idLegislatura']
+  end
+
+  field :name do
+    noko.css('div#curriculum div.nombre_dip').text
+  end
+
+  field :family_names do
+    name.split(/,/).first.tidy
+  end
+
+  field :given_names do
+    name.split(/,/).last.tidy
+  end
+
+  field :gender do
+    return 'female' if seat.include? 'Diputada'
+    return 'male' if seat.include? 'Diputado'
+  end
+
+  field :source do
+    url.to_s
+  end
+
+  field :dob do
+    # TODO: Make a class to encapsulate parsing the DOB.
+  end
+
+  field :faction do
+    faction_information[:faction].tidy
+  end
+
+  field :faction_id do
+    faction_information[:faction_id].tidy
+  end
+
+  private
+
+  def seat
+    @seat ||= noko.at_css('div#curriculum div.texto_dip ul li div.dip_rojo:first').text.tidy
+  end
+
+  def group
+    @group ||= noko.at_css('div#curriculum div.texto_dip ul li div.dip_rojo:last').text.tidy
+  end
+
+  def query
+    @query ||= URI.decode_www_form(URI.parse(url).query).to_h
+  end
+
+  def faction_information
+    @faction_information ||= group.match(/(?<faction>.*?) \((?<faction_id>.*?)\)/)
+  end
+end
diff --git a/lib/members_list_page.rb b/lib/members_list_page.rb
new file mode 100644
index 000000000..29c26c53b
--- /dev/null
+++ b/lib/members_list_page.rb
@@ -0,0 +1,11 @@
+require 'scraped_page'
+
+class MembersListPage < ScrapedPage
+  def member_urls
+    @member_urls ||= noko.css('div#RESULTADOS_DIPUTADOS div.listado_1 ul li a').map { |p| p[:href] }
+  end
+
+  def next_page_url
+    @next_page_url ||= noko.css('//div[@class = "paginacion"]//a[contains("Página Siguiente")]').first[:href]
+  end
+end
diff --git a/scraper.rb b/scraper.rb
index f30fe3e47..c04e8513b 100644
--- a/scraper.rb
+++ b/scraper.rb
@@ -286,5 +286,15 @@ def scrape_person(term, url)
   ScraperWiki.save_sqlite([:id, :term], data)
 end

-scrape_people('http://www.congreso.es/portal/page/portal/Congreso/Congreso/Diputados/DiputadosTodasLegislaturas')
-scrape_memberships()
+# scrape_people('http://www.congreso.es/portal/page/portal/Congreso/Congreso/Diputados/DiputadosTodasLegislaturas')
+# scrape_memberships()
+
+require_relative 'lib/members_list_page'
+require_relative 'lib/member_page'
+
+start_url = 'http://www.congreso.es/portal/page/portal/Congreso/Congreso/Diputados/DiputadosTodasLegislaturas'
+
+MembersListPage.new(url: start_url).member_urls.each do |member_url|
+  member = MemberPage.new(url: URI.join(start_url, member_url))
+  ScraperWiki.save_sqlite([:name, :term], member.to_h)
+end
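[Editor's note — not part of the patch series. A sketch of how the new class-based flow fits together, assuming the scraped_page gem's `field` DSL (provided via field_serializer) defines one method per declared field and `to_h` collects them all. The query string below is invented, in the style of the session-stamped congreso.es member URLs that the `url` override expects:]

    require_relative 'lib/member_page'

    # Each `field` declaration becomes an instance method; #to_h evaluates
    # every declared field (the noko-backed ones fetch and parse the page).
    member = MemberPage.new(url: 'http://www.congreso.es/portal/page/portal/Congreso/Congreso/Diputados/BusqForm?_piref73_1333155_73.next_page=/wc/fichaDiputado&idDiputado=171&idLegislatura=12')
    member.term # => "12" (from the idLegislatura query parameter; no fetch needed)
    member.to_h # => { iddiputado: "171", term: "12", name: "...", ... }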
From 454a4a5fd6aef83b77393e9b8a5deb29508947d3 Mon Sep 17 00:00:00 2001
From: Chris Mytton <chrismytton@gmail.com>
Date: Mon, 14 Nov 2016 13:30:13 +0100
Subject: [PATCH 03/20] Add class to represent members' DOB

---
 lib/member_page.rb | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/lib/member_page.rb b/lib/member_page.rb
index a2e052716..f603e7239 100644
--- a/lib/member_page.rb
+++ b/lib/member_page.rb
@@ -7,6 +7,31 @@ def tidy
 end

 class MemberPage < ScrapedPage
+  class DateOfBirth
+    DATE_REGEX = /(?<day>\d+) de (?<month>[^[:space:]]*) de (?<year>\d+)/
+
+    def initialize(date_string)
+      @date_string = date_string
+    end
+
+    def to_s
+      return if match.nil?
+      "%d-%02d-%02d" % [ match[:year], month(match[:month]), match[:day] ]
+    end
+
+    private
+
+    attr_reader :date_string
+
+    def match
+      @match ||= date_string.match(DATE_REGEX)
+    end
+
+    def month(str)
+      ['','enero','febrero','marzo','abril','mayo','junio','julio','agosto','septiembre','octubre','noviembre','diciembre'].find_index(str.downcase) or raise "Unknown month #{str}".magenta
+    end
+  end
+
   # Remove session information from url
   def url
     super.to_s.match(/(.*)_piref[\d_]+\.(next_page.*)/).captures.join('')
@@ -42,7 +67,9 @@ def url
   end

   field :dob do
-    # TODO: Make a class to encapsulate parsing the DOB.
+    DateOfBirth.new(
+      noko.xpath('.//div[@class="titular_historico"]/following::div/ul/li').first.text
+    ).to_s
   end

   field :faction do
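[Editor's note — not part of the patch series. What the new class does with the birth-date strings found on the profile pages; the input below is a made-up example in the same format:]

    MemberPage::DateOfBirth.new('Nacido el 12 de octubre de 1958 en Madrid').to_s
    # => "1958-10-12"
    # DATE_REGEX captures day/month/year, and the month name is mapped to its
    # number via its index in the month-name array ('octubre' => 10).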
+ "%d-%02d-%02d" % [ match[:year], month(match[:month]), match[:day] ] + end + + private + + attr_reader :date_string + + def match + @match ||= date_string.match(DATE_REGEX) + end + + def month(str) + ['', 'enero', 'febrero', 'marzo', 'abril', 'mayo', 'junio', 'julio', 'agosto', 'septiembre', 'octubre', 'noviembre', 'diciembre'].find_index(str.downcase) || raise("Unknown month #{str}".magenta) + end +end diff --git a/lib/member_page.rb b/lib/member_page.rb index efc437517..b6860e7ae 100644 --- a/lib/member_page.rb +++ b/lib/member_page.rb @@ -1,4 +1,5 @@ require 'scraped_page' +require_relative 'date_of_birth' class String def tidy @@ -7,31 +8,6 @@ def tidy end class MemberPage < ScrapedPage - class DateOfBirth - DATE_REGEX = /(?<day>\d+) de (?<month>[^[:space:]]*) de (?<year>\d+)/ - - def initialize(date_string) - @date_string = date_string - end - - def to_s - return if match.nil? - "%d-%02d-%02d" % [ match[:year], month(match[:month]), match[:day] ] - end - - private - - attr_reader :date_string - - def match - @match ||= date_string.match(DATE_REGEX) - end - - def month(str) - ['','enero','febrero','marzo','abril','mayo','junio','julio','agosto','septiembre','octubre','noviembre','diciembre'].find_index(str.downcase) or raise "Unknown month #{str}".magenta - end - end - # Remove session information from url def url super.to_s.match(/(.*)_piref[\d_]+\.(next_page.*)/).captures.join('') From 7c1c583e47b6333e44892399d9c3bfa589e7dd21 Mon Sep 17 00:00:00 2001 From: Chris Mytton <chrismytton@gmail.com> Date: Mon, 14 Nov 2016 17:33:57 +0100 Subject: [PATCH 07/20] Handle missing faction information --- lib/member_page.rb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/member_page.rb b/lib/member_page.rb index b6860e7ae..924a7d0c8 100644 --- a/lib/member_page.rb +++ b/lib/member_page.rb @@ -53,11 +53,11 @@ def url end field :faction do - faction_information[:faction].tidy + faction_information[:faction].to_s.tidy end field :faction_id do - faction_information[:faction_id].tidy + faction_information[:faction_id].to_s.tidy end field :start_date do @@ -119,6 +119,6 @@ def query end def faction_information - @faction_information ||= group.match(/(?<faction>.*?) \((?<faction_id>.*?)\)/) + @faction_information ||= group.match(/(?<faction>.*?) \((?<faction_id>.*?)\)/) || {} end end From 14bea9d35b509fcc8121cd0a7f8f154e6595e092 Mon Sep 17 00:00:00 2001 From: Chris Mytton <chrismytton@gmail.com> Date: Mon, 14 Nov 2016 17:35:53 +0100 Subject: [PATCH 08/20] More robust url session handling Use the URI module to make it easier to just replace the unwanted part of the query string. --- lib/member_page.rb | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/member_page.rb b/lib/member_page.rb index 924a7d0c8..a62689897 100644 --- a/lib/member_page.rb +++ b/lib/member_page.rb @@ -10,7 +10,10 @@ def tidy class MemberPage < ScrapedPage # Remove session information from url def url - super.to_s.match(/(.*)_piref[\d_]+\.(next_page.*)/).captures.join('') + uri = URI.parse(super.to_s) + return uri.to_s unless uri.query + uri.query = uri.query.gsub(/_piref[\d_]+\./, '') + uri.to_s end field :iddiputado do From d48f099c9617ec4af6dc9763ce5406825859eea9 Mon Sep 17 00:00:00 2001 From: Chris Mytton <chrismytton@gmail.com> Date: Mon, 14 Nov 2016 17:39:01 +0100 Subject: [PATCH 09/20] Extract a SpanishCongressPage base class This means that rather than having to override the url in every class in the system we can just do it once here and then inherit from this class. 
From 14bea9d35b509fcc8121cd0a7f8f154e6595e092 Mon Sep 17 00:00:00 2001
From: Chris Mytton <chrismytton@gmail.com>
Date: Mon, 14 Nov 2016 17:35:53 +0100
Subject: [PATCH 08/20] More robust url session handling

Use the URI module to make it easier to just replace the unwanted part
of the query string.
---
 lib/member_page.rb | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/lib/member_page.rb b/lib/member_page.rb
index 924a7d0c8..a62689897 100644
--- a/lib/member_page.rb
+++ b/lib/member_page.rb
@@ -10,7 +10,10 @@ def tidy
 class MemberPage < ScrapedPage
   # Remove session information from url
   def url
-    super.to_s.match(/(.*)_piref[\d_]+\.(next_page.*)/).captures.join('')
+    uri = URI.parse(super.to_s)
+    return uri.to_s unless uri.query
+    uri.query = uri.query.gsub(/_piref[\d_]+\./, '')
+    uri.to_s
   end
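[Editor's note — not part of the patch series. The effect of the rewritten #url, shown on an invented session token in the congreso.es style; URLs without a query string now pass through untouched instead of crashing on a failed match:]

    require 'uri'

    uri = URI.parse('http://www.congreso.es/portal/page/portal/Congreso?_piref73_1333155_73.idDiputado=171')
    uri.query = uri.query.gsub(/_piref[\d_]+\./, '')
    uri.to_s # => "http://www.congreso.es/portal/page/portal/Congreso?idDiputado=171"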
From d48f099c9617ec4af6dc9763ce5406825859eea9 Mon Sep 17 00:00:00 2001
From: Chris Mytton <chrismytton@gmail.com>
Date: Mon, 14 Nov 2016 17:39:01 +0100
Subject: [PATCH 09/20] Extract a SpanishCongressPage base class

This means that rather than having to override the url in every class
in the system, we can do it once here and then inherit from this class.
---
 lib/member_page.rb           | 12 ++----------
 lib/spanish_congress_page.rb | 12 ++++++++++++
 2 files changed, 14 insertions(+), 10 deletions(-)
 create mode 100644 lib/spanish_congress_page.rb

diff --git a/lib/member_page.rb b/lib/member_page.rb
index a62689897..2430eb08e 100644
--- a/lib/member_page.rb
+++ b/lib/member_page.rb
@@ -1,4 +1,4 @@
-require 'scraped_page'
+require_relative 'spanish_congress_page'
 require_relative 'date_of_birth'

 class String
@@ -7,15 +7,7 @@ def tidy
   end
 end

-class MemberPage < ScrapedPage
-  # Remove session information from url
-  def url
-    uri = URI.parse(super.to_s)
-    return uri.to_s unless uri.query
-    uri.query = uri.query.gsub(/_piref[\d_]+\./, '')
-    uri.to_s
-  end
-
+class MemberPage < SpanishCongressPage
   field :iddiputado do
     query['idDiputado']
   end
diff --git a/lib/spanish_congress_page.rb b/lib/spanish_congress_page.rb
new file mode 100644
index 000000000..a71aa069d
--- /dev/null
+++ b/lib/spanish_congress_page.rb
@@ -0,0 +1,12 @@
+require 'scraped_page'
+require 'uri'
+
+class SpanishCongressPage < ScrapedPage
+  # Remove session information from url
+  def url
+    uri = URI.parse(super.to_s)
+    return uri.to_s unless uri.query
+    uri.query = uri.query.gsub(/_piref[\d_]+\./, '')
+    uri.to_s
+  end
+end

From 5485aae9496475c4e383fb8194a609aeba082925 Mon Sep 17 00:00:00 2001
From: Chris Mytton <chrismytton@gmail.com>
Date: Mon, 14 Nov 2016 17:40:21 +0100
Subject: [PATCH 10/20] Move String#tidy into its own file

This should become part of ScrapedPage RSN.
---
 lib/core_ext.rb    | 6 ++++++
 lib/member_page.rb | 7 +------
 2 files changed, 7 insertions(+), 6 deletions(-)
 create mode 100644 lib/core_ext.rb

diff --git a/lib/core_ext.rb b/lib/core_ext.rb
new file mode 100644
index 000000000..d701f70d4
--- /dev/null
+++ b/lib/core_ext.rb
@@ -0,0 +1,6 @@
+# frozen_string_literal: true
+class String
+  def tidy
+    gsub(/[[:space:]]+/, ' ').strip
+  end
+end
diff --git a/lib/member_page.rb b/lib/member_page.rb
index 2430eb08e..b4b3a88ef 100644
--- a/lib/member_page.rb
+++ b/lib/member_page.rb
@@ -1,11 +1,6 @@
 require_relative 'spanish_congress_page'
 require_relative 'date_of_birth'
-
-class String
-  def tidy
-    self.gsub(/[[:space:]]+/, ' ').strip
-  end
-end
+require_relative 'core_ext'

 class MemberPage < SpanishCongressPage
   field :iddiputado do

From b3b1096246bc7923c5520bcb976be28eda914966 Mon Sep 17 00:00:00 2001
From: Chris Mytton <chrismytton@gmail.com>
Date: Mon, 14 Nov 2016 17:41:12 +0100
Subject: [PATCH 11/20] Make MembersListPage inherit from SpanishCongressPage

This means these pages also have the session information stripped from
the url.
---
 lib/members_list_page.rb | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/members_list_page.rb b/lib/members_list_page.rb
index 29c26c53b..c68ce99c0 100644
--- a/lib/members_list_page.rb
+++ b/lib/members_list_page.rb
@@ -1,6 +1,6 @@
-require 'scraped_page'
+require_relative 'spanish_congress_page'

-class MembersListPage < ScrapedPage
+class MembersListPage < SpanishCongressPage
   def member_urls
     @member_urls ||= noko.css('div#RESULTADOS_DIPUTADOS div.listado_1 ul li a').map { |p| p[:href] }
   end

From c7163ee817df920c7567b5fa2e6ffe4bb322253d Mon Sep 17 00:00:00 2001
From: Chris Mytton <chrismytton@gmail.com>
Date: Mon, 14 Nov 2016 17:41:40 +0100
Subject: [PATCH 12/20] Handle pagination in scraper.rb

Rather than just scraping the first page, scrape all pages.
---
 scraper.rb | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/scraper.rb b/scraper.rb
index c04e8513b..eb9933ea8 100644
--- a/scraper.rb
+++ b/scraper.rb
@@ -292,9 +292,14 @@ def scrape_person(term, url)
 require_relative 'lib/members_list_page'
 require_relative 'lib/member_page'

-start_url = 'http://www.congreso.es/portal/page/portal/Congreso/Congreso/Diputados/DiputadosTodasLegislaturas'
+url = 'http://www.congreso.es/portal/page/portal/Congreso/Congreso/Diputados/DiputadosTodasLegislaturas'

-MembersListPage.new(url: start_url).member_urls.each do |member_url|
-  member = MemberPage.new(url: URI.join(start_url, member_url))
-  ScraperWiki.save_sqlite([:name, :term], member.to_h)
+loop do
+  page = MembersListPage.new(url: url)
+  page.member_urls.each do |member_url|
+    member = MemberPage.new(url: URI.join(url, member_url))
+    ScraperWiki.save_sqlite([:name, :term], member.to_h)
+  end
+  url = page.next_page_url
+  break if url.nil?
 end

From 0e158b982524fcfe4be6e9a2034a5a90e1e6fa1c Mon Sep 17 00:00:00 2001
From: Chris Mytton <chrismytton@gmail.com>
Date: Mon, 14 Nov 2016 20:18:07 +0100
Subject: [PATCH 13/20] Convert name parts to string before tidying

---
 lib/member_page.rb | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/member_page.rb b/lib/member_page.rb
index b4b3a88ef..9b0e16300 100644
--- a/lib/member_page.rb
+++ b/lib/member_page.rb
@@ -16,11 +16,11 @@ class MemberPage < SpanishCongressPage
   end

   field :family_names do
-    name.split(/,/).first.tidy
+    name.split(/,/).first.to_s.tidy
   end

   field :given_names do
-    name.split(/,/).last.tidy
+    name.split(/,/).last.to_s.tidy
   end

   field :gender do

From cddfd484e4be37de5e2454fd6dc2591e97d2d161 Mon Sep 17 00:00:00 2001
From: Chris Mytton <chrismytton@gmail.com>
Date: Mon, 14 Nov 2016 20:30:43 +0100
Subject: [PATCH 14/20] Correctly handle the last page of results

On the last page there is no next link, so trying to call
`next[:href]` blows up. Instead I've moved the link lookup into a
separate `next_page_link` method, and `next_page_url` now checks that
the link exists before asking it for its href.
---
 lib/members_list_page.rb | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/lib/members_list_page.rb b/lib/members_list_page.rb
index c68ce99c0..9f68a3c8f 100644
--- a/lib/members_list_page.rb
+++ b/lib/members_list_page.rb
@@ -6,6 +6,10 @@ def member_urls
   end

   def next_page_url
-    @next_page_url ||= noko.css('//div[@class = "paginacion"]//a[contains("Página Siguiente")]').first[:href]
+    next_page_link && next_page_link[:href]
+  end
+
+  def next_page_link
+    @next_page_link ||= noko.css('//div[@class = "paginacion"]//a[contains("Página Siguiente")]').first
   end
 end
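[Editor's note — not part of the patch series. On the final results page there is no "Página Siguiente" anchor, so `next_page_link` comes back nil, the `&&` short-circuits before `[:href]` is ever called, and `next_page_url` returns the nil that scraper.rb's `break if url.nil?` is waiting for:]

    page = MembersListPage.new(url: last_page_url) # last_page_url: any final results page
    page.next_page_link # => nil
    page.next_page_url  # => nil, instead of NoMethodError on nil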
From 80b4a12e4f2adfefded7f06d551cbe748c07ce5f Mon Sep 17 00:00:00 2001
From: Chris Mytton <chrismytton@gmail.com>
Date: Mon, 14 Nov 2016 20:44:46 +0100
Subject: [PATCH 15/20] Simplify phone and fax fields

---
 lib/member_page.rb | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/member_page.rb b/lib/member_page.rb
index 9b0e16300..a657dd7e4 100644
--- a/lib/member_page.rb
+++ b/lib/member_page.rb
@@ -77,11 +77,11 @@ class MemberPage < SpanishCongressPage
   end

   field :phone do
-    noko.css('.texto_dip').map(&:text).join('').match(/Teléfono: (.*)$/).to_a.last.to_s.tidy
+    noko.css('.texto_dip').text.match(/Teléfono: (.*)$/).to_a.last.to_s.tidy
   end

   field :fax do
-    noko.css('.texto_dip').map(&:text).join('').match(/Fax: (.*)$/).to_a.last.to_s.tidy
+    noko.css('.texto_dip').text.match(/Fax: (.*)$/).to_a.last.to_s.tidy
   end

   field :constituency do

From c25c4b0ebbcb2396b12ba3f4fd281e3aff42c138 Mon Sep 17 00:00:00 2001
From: Chris Mytton <chrismytton@gmail.com>
Date: Mon, 14 Nov 2016 20:45:22 +0100
Subject: [PATCH 16/20] DateOfBirth#to_s should always return a string

---
 lib/date_of_birth.rb | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/date_of_birth.rb b/lib/date_of_birth.rb
index 9f922073f..0e1adcd01 100644
--- a/lib/date_of_birth.rb
+++ b/lib/date_of_birth.rb
@@ -7,8 +7,8 @@ def initialize(date_string)
   end

   def to_s
-    return if match.nil?
-    "%d-%02d-%02d" % [ match[:year], month(match[:month]), match[:day] ]
+    return '' if match.nil?
+    '%d-%02d-%02d' % [match[:year], month(match[:month]), match[:day]]
   end

   private

From 4fb737a6d43d535fa7349d5c9a83c4b0164727cd Mon Sep 17 00:00:00 2001
From: Chris Mytton <chrismytton@gmail.com>
Date: Mon, 14 Nov 2016 21:02:47 +0100
Subject: [PATCH 17/20] Run rubocop -a on lib

---
 lib/member_page.rb           | 5 +++--
 lib/members_list_page.rb     | 1 +
 lib/spanish_congress_page.rb | 1 +
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/lib/member_page.rb b/lib/member_page.rb
index a657dd7e4..278f9f0a2 100644
--- a/lib/member_page.rb
+++ b/lib/member_page.rb
@@ -1,3 +1,4 @@
+# frozen_string_literal: true
 require_relative 'spanish_congress_page'
 require_relative 'date_of_birth'
 require_relative 'core_ext'
@@ -53,14 +53,14 @@ class MemberPage < SpanishCongressPage
   field :start_date do
     start_date = noko.xpath('.//div[@class="dip_rojo"][contains(.,"Fecha alta")]')
-      .text.match(/(\d+)\/(\d+)\/(\d+)\./)
+                     .text.match(/(\d+)\/(\d+)\/(\d+)\./)
     return if start_date.nil?
     start_date.captures.reverse.join('-')
   end

   field :end_date do
     end_date = noko.xpath('.//div[@class="dip_rojo"][contains(.,"Causó baja")]')
-      .text.match(/(\d+)\/(\d+)\/(\d+)\./)
+                   .text.match(/(\d+)\/(\d+)\/(\d+)\./)
     return if end_date.nil?
     end_date.captures.reverse.join('-')
   end
diff --git a/lib/members_list_page.rb b/lib/members_list_page.rb
index 9f68a3c8f..5597d4bde 100644
--- a/lib/members_list_page.rb
+++ b/lib/members_list_page.rb
@@ -1,3 +1,4 @@
+# frozen_string_literal: true
 require_relative 'spanish_congress_page'

 class MembersListPage < SpanishCongressPage
diff --git a/lib/spanish_congress_page.rb b/lib/spanish_congress_page.rb
index a71aa069d..d9965fb52 100644
--- a/lib/spanish_congress_page.rb
+++ b/lib/spanish_congress_page.rb
@@ -1,3 +1,4 @@
+# frozen_string_literal: true
 require 'scraped_page'
 require 'uri'
From b1c18846f2d2b22aa66e9552107007ae432a852a Mon Sep 17 00:00:00 2001
From: Chris Mytton <chrismytton@gmail.com>
Date: Mon, 14 Nov 2016 21:05:32 +0100
Subject: [PATCH 18/20] Delete unused code in scraper.rb

---
 scraper.rb | 292 +----------------------------------------------------
 1 file changed, 2 insertions(+), 290 deletions(-)

diff --git a/scraper.rb b/scraper.rb
index eb9933ea8..fe1406276 100644
--- a/scraper.rb
+++ b/scraper.rb
@@ -1,294 +1,6 @@
-#!/bin/env ruby
-# encoding: utf-8
-
+# frozen_string_literal: true
 require 'scraperwiki'
-require 'capybara'
-require 'capybara/dsl'
-require 'capybara/poltergeist'
-require 'pry'
-require 'scraped_page_archive/capybara'
-
-Capybara.default_max_wait_time = 5
-
-
-# images are very slow to load and cause timeouts and
-# as we don't need them skip
-# Also, some pages have JS errors which we don't care about
-options = {
-  js_errors: false,
-  timeout: 60,
-  phantomjs_options: ['--load-images=no']
-}
-
-Capybara.register_driver :poltergeist do |app|
-  Capybara::Poltergeist::Driver.new(app, options)
-end
-
-include Capybara::DSL
-Capybara.default_driver = :poltergeist
-
-
-class String
-  def tidy
-    self.gsub(/[[:space:]]+/, ' ').strip
-  end
-end
-
-def month(str)
-  ['','enero','febrero','marzo','abril','mayo','junio','julio','agosto','septiembre','octubre','noviembre','diciembre'].find_index(str) or raise "Unknown month #{str}".magenta
-end
-
-def date_of_birth(str)
-  matched = str.match(/(\d+) de ([^[:space:]]*) de (\d+)/) or return
-  day, month, year = matched.captures
-  "%d-%02d-%02d" % [ year, month(month), day ]
-end
-
-def gender_from(seat)
-  return 'female' if seat.include? 'Diputada'
-  return 'male' if seat.include? 'Diputado'
-  return
-end
-
-def save_membership_from_url(name, url)
-  iddiputado = url.to_s.match(/idDiputado=(\d+)/).captures[0]
-  term = url.to_s.match(/idLegislatura=(\d+)/).captures[0]
-  # strip out session id
-  url = url.match(/(.*)_piref[\d_]+\.(next_page.*)/).captures.join('')
-
-  # we can set this to rescrape everything if required
-  unless ENV.key?('MORPH_RESCRAPE_ALL')
-    # don't save data again
-    cur_name = ScraperWiki::select('name FROM memberships WHERE iddiputado is ? AND term is ?', [iddiputado, term]) rescue nil
-    unless cur_name.nil? or cur_name.empty?
-      return
-    end
-  end
-
-  person = {
-    id: 0,
-    name: name.tidy,
-    term: term,
-    iddiputado: iddiputado,
-    url: url
-  }
-
-  ScraperWiki.save_sqlite([:term, :iddiputado], person, 'memberships')
-end
-
-# use the first term they were elected in and the id from that term as the unique id
-# although for people with only one term the page in question seems to fall over so
-# fall back to the current term and id for those people as it's presumably their first
-def get_unique_id(url, page_term, page_iddiputado, name)
-  cur_id = ScraperWiki::select('id FROM memberships WHERE iddiputado is ? AND term is ? and id <> 0', [page_iddiputado, page_term]) rescue nil
-  unless cur_id.nil? or cur_id.empty?
-    return cur_id[0][:id]
-  end
-  sleep(1)
-
-  visit url
-
-  term_map = {}
-  all('div.all_leg').each do |legislature|
-    within(legislature) do
-      term = nil
-      if legislature.has_css?('div.btn_ficha a')
-        link = find('div.btn_ficha a')
-        href = link['href']
-        # we can't do this as one operation as they don't always appear
-        # in the same order :(
-        term = href.to_s.match(/idLegislatura=(\d+)/).captures[0]
-        id = href.to_s.match(/idDiputado=(\d+)/).captures[0]
-        term_map[term.to_i] = id
-        save_membership_from_url(name, href)
-      end
-      if not term.nil? and legislature.has_css?('div.principal')
-        term_div = find('div.principal')
-        name, start_year, end_year = term_div.text.match(/(\w+\s*\w+)\s*\(\s*(\d+)\s*-\s*([^)]*)\)/).captures
-        if end_year.tidy == 'Actualidad'
-          end_year = ''
-        end
-        exists = ScraperWiki::select('id FROM terms WHERE id is ??', [id]) rescue nil
-        if exists.nil?
-          term = {
-            id: term,
-            name: name.tidy,
-            start_date: start_year.tidy,
-            end_date: end_year.tidy,
-            source: 'http://www.congreso.es/',
-          }
-          ScraperWiki.save_sqlite([:id], term, 'terms')
-        end
-      end
-    end
-  end
-
-  # the all terms page seems to be very unreliable so if we can't find what we expect
-  # then we should quite rather than trying to make up an incorrect ID
-  if term_map.empty?
-    return nil
-  end
-
-  min_term = term_map.keys.min
-
-  id = "#{min_term}_#{term_map[min_term]}"
-  for term in term_map.keys
-    ScraperWiki.sqliteexecute('update memberships set id = ? where id = 0 and term = ? and iddiputado = ?', [id, term, term_map[term]])
-  end
-  return id
-end
-
-def scrape_people(url)
-  visit url
-
-  all('div#RESULTADOS_DIPUTADOS div.listado_1 ul li a').each do |link|
-    save_membership_from_url(link.text, link['href'])
-  end
-
-  pagination = all('div.paginacion').first
-  next_page = nil
-  if pagination.has_xpath?(".//a[contains(.,'Página Siguiente')]")
-    within (pagination) do
-      next_page = find(:xpath, ".//a[contains(.,'Página Siguiente')]")
-    end
-  end
-
-  # the website is a bit fragile to lets not hammer it with requests
-  sleep(2)
-  unless next_page.nil?
-    scrape_people(next_page['href'])
-  end
-end
-
-def scrape_memberships()
-  memberships = ScraperWiki::select('* FROM memberships')
-  for membership in memberships
-    scrape_person(membership['term'], membership['url'])
-  end
-end
-
-def scrape_person(term, url)
-  iddiputado = url.to_s[/idDiputado=(\d+)/, 1]
-
-  unless ENV.key?('MORPH_RESCRAPE_ALL') or (ENV.key?('MORPH_RESCRAPE_TERM') and ENV['MORPH_RESCRAPE_TERM'] == term)
-    # don't scrape data we already have
-    name = ScraperWiki::select('name FROM data WHERE iddiputado is ? AND term is ?', [iddiputado, term]) rescue nil
-    unless name.nil? or name.empty?
-      #name = name[0]['name']
-      #puts "skipping #{name} for #{term}"
-      return
-    end
-  end
-  sleep(1)
-
-  # only visit URL if we are collecting the data
-  visit url
-
-  seat, group = all('div#curriculum div.texto_dip ul li div.dip_rojo').map(&:text).map(&:tidy)
-  faction, faction_id = group.match(/(.*?) \((.*?)\)/).captures.to_a.map(&:tidy) rescue nil
-
-  # sometimes the scraper doesn't find the name on the page and rather than stop scraping
-  # everything else just move on to the next person
-  begin
-    name = find('div#curriculum div.nombre_dip').text
-  rescue
-    $stderr.puts "failed to find name element for #{url}"
-    return
-  end
-
-  family_names, given_names = name.split(/,/).map(&:tidy)
-
-  if page.has_xpath?('.//div[@class="dip_rojo"][contains(.,"Fecha alta")]')
-    fecha_alta = find(:xpath, './/div[@class="dip_rojo"][contains(.,"Fecha alta")]')
-    start_date = fecha_alta.text.match(/(\d+)\/(\d+)\/(\d+)\./).captures.reverse.join("-")
-  end
-
-  if page.has_xpath?('.//div[@class="dip_rojo"][contains(.,"Causó baja")]')
-    causo_baja = find(:xpath, './/div[@class="dip_rojo"][contains(.,"Causó baja")]')
-    end_date = causo_baja.text.match(/(\d+)\/(\d+)\/(\d+)\./).captures.reverse.join("-")
-  end
-
-  dob = ''
-  email = ''
-  twitter = ''
-  facebook = ''
-  photo = ''
-  within('div.titular_historico') do
-    dob = date_of_birth(all(:xpath, 'following::div/ul/li')[0].text)
-  end
-
-  # capybara doesn't support enough xpath to do this
-  # sensibly so we have to do this the longwinded way
-  if page.has_xpath?('//div[@class="webperso_dip"]/div[@class="webperso_dip_parte"|@class="webperso_dip_imagen"]/a')
-    all(:xpath, '//div[@class="webperso_dip"]/div[@class="webperso_dip_parte"|@class="webperso_dip_imagen"]/a').each do |link|
-      href = link['href']
-      if href.match(/mailto/)
-        email = link.text.tidy
-      end
-      if href.match(/twitter.com/)
-        twitter = href.match(/twitter.com\/(.*)$/).captures[0]
-      end
-      if href.match(/facebook.com/)
-        facebook = href
-      end
-    end
-  end
-
-  all('div#datos_diputado').each do |img|
-    within(img) do
-      if img.has_xpath?('.//p[@class="logo_group"]/img[@name="foto"]')
-        photo = find(:xpath, './/p[@class="logo_group"]/img[@name="foto"]')['src'].text
-      end
-    end
-  end
-
-  data = {
-    iddiputado: iddiputado,
-    name: "#{given_names} #{family_names}",
-    sort_name: name,
-    given_name: given_names,
-    family_name: family_names,
-    gender: gender_from(seat),
-    party: find('div#datos_diputado p.nombre_grupo').text.tidy,
-    faction_id: faction_id,
-    faction: faction,
-    source: url.to_s,
-    dob: dob,
-    term: term,
-    start_date: start_date,
-    end_date: end_date,
-    email: email,
-    twitter: twitter,
-    facebook: facebook,
-    phone: all('div.texto_dip').map(&:text).join('').match(/Teléfono: (.*)$/).to_a.last.to_s.tidy,
-    fax: all('div.texto_dip').map(&:text).join('').match(/Fax: (.*)$/).to_a.last.to_s.tidy,
-    constituency: seat[/Diputad. por (.*)\./, 1],
-    photo: photo,
-  }
-  data[:photo] = URI.join(url, data[:photo]).to_s unless data[:photo].to_s.empty?
-
-  all_terms_url = find('div.soporte_year li a')['href'].match('.*listadoFichas.*').to_a.first.to_s
-
-  # it might seem a bit odd to do this only once we've worked out everything
-  # else but doing it this way means we don't need to visit the all terms page
-  # and then go back so it's one less network call per person
-  id = get_unique_id(all_terms_url, term, iddiputado, name)
-
-  # don't save things if we don't get an id
-  if id.nil?
-    #puts "no id so not saving"
-    return
-  end
-
-  data[:id] = id
-
-  #puts "%s - %s\n" % [ data[:name], data[:id] ]
-  ScraperWiki.save_sqlite([:id, :term], data)
-end
-
-# scrape_people('http://www.congreso.es/portal/page/portal/Congreso/Congreso/Diputados/DiputadosTodasLegislaturas')
-# scrape_memberships()
-
+require 'uri'
 require_relative 'lib/members_list_page'
 require_relative 'lib/member_page'

From 4b7fe80d42dcb8c8bb53a4f7290dc27e20280af0 Mon Sep 17 00:00:00 2001
From: Chris Mytton <chrismytton@gmail.com>
Date: Mon, 14 Nov 2016 21:05:50 +0100
Subject: [PATCH 19/20] Remove unneeded gems

Most gems we need now come as dependencies of scraped_page.
---
 Gemfile      |  6 ------
 Gemfile.lock | 45 +++------------------------------------------
 2 files changed, 3 insertions(+), 48 deletions(-)

diff --git a/Gemfile b/Gemfile
index c81e1866f..09297b106 100644
--- a/Gemfile
+++ b/Gemfile
@@ -8,11 +8,5 @@ git_source(:github) { |repo_name| "https://github.com/#{repo_name}.git" }
 ruby "2.3.1"

 gem "scraperwiki", github: "openaustralia/scraperwiki-ruby", branch: "morph_defaults"
-gem "nokogiri"
-gem "open-uri-cached"
 gem "pry"
-gem "colorize"
-gem "capybara"
-gem "poltergeist"
-gem 'scraped_page_archive', github: "everypolitician/scraped_page_archive", branch: "master"
 gem 'scraped_page', github: 'everypolitician/scraped_page', branch: 'master'
diff --git a/Gemfile.lock b/Gemfile.lock
index 5a267447d..045b1e02e 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -8,15 +8,6 @@ GIT
       nokogiri
       scraped_page_archive (>= 0.5)

-GIT
-  remote: https://github.com/everypolitician/scraped_page_archive.git
-  revision: c4056f6029f284a326e56c129766e397995e55f7
-  branch: master
-  specs:
-    scraped_page_archive (0.5.0)
-      git (~> 1.3.0)
-      vcr-archive (~> 0.3.0)
-
 GIT
   remote: https://github.com/openaustralia/scraperwiki-ruby.git
   revision: fc50176812505e463077d5c673d504a6a234aa78
@@ -31,16 +22,7 @@ GEM
   specs:
     addressable (2.5.0)
       public_suffix (~> 2.0, >= 2.0.2)
-    capybara (2.6.2)
-      addressable
-      mime-types (>= 1.16)
-      nokogiri (>= 1.3.3)
-      rack (>= 1.0.0)
-      rack-test (>= 0.5.4)
-      xpath (~> 2.0)
-    cliver (0.3.2)
     coderay (1.1.1)
-    colorize (0.7.7)
     crack (0.4.3)
       safe_yaml (~> 1.0.0)
     field_serializer (0.2.0)
@@ -48,28 +30,18 @@ GEM
     hashdiff (0.3.0)
     httpclient (2.7.1)
     method_source (0.8.2)
-    mime-types (3.0)
-      mime-types-data (~> 3.2015)
-    mime-types-data (3.2016.0221)
     mini_portile2 (2.1.0)
-    multi_json (1.11.2)
     nokogiri (1.6.8.1)
       mini_portile2 (~> 2.1.0)
-    open-uri-cached (0.0.5)
-    poltergeist (1.9.0)
-      capybara (~> 2.1)
-      cliver (~> 0.3.1)
-      multi_json (~> 1.0)
-      websocket-driver (>= 0.2.0)
     pry (0.10.3)
       coderay (~> 1.1.0)
       method_source (~> 0.8.1)
       slop (~> 3.4)
     public_suffix (2.0.4)
-    rack (1.6.4)
-    rack-test (0.6.3)
-      rack (>= 1.0)
     safe_yaml (1.0.4)
+    scraped_page_archive (0.5.0)
+      git (~> 1.3.0)
+      vcr-archive (~> 0.3.0)
     slop (3.6.0)
     sqlite3 (1.3.11)
     sqlite_magic (0.0.6)
@@ -82,24 +54,13 @@ GEM
       addressable (>= 2.3.6)
       crack (>= 0.3.2)
       hashdiff
-    websocket-driver (0.6.3)
-      websocket-extensions (>= 0.1.0)
-    websocket-extensions (0.1.2)
-    xpath (2.0.0)
-      nokogiri (~> 1.3)

 PLATFORMS
   ruby

 DEPENDENCIES
-  capybara
-  colorize
-  nokogiri
-  open-uri-cached
-  poltergeist
   pry
   scraped_page!
-  scraped_page_archive!
   scraperwiki!

From 95cad54f9caed557063bf02961ae720a49485af2 Mon Sep 17 00:00:00 2001
From: Chris Mytton <chrismytton@gmail.com>
Date: Mon, 20 Feb 2017 10:51:01 +0100
Subject: [PATCH 20/20] Add open-uri-cached

This caches a copy of fetched webpages to make the scraper quicker to
develop locally.
---
 .gitignore   | 1 +
 Gemfile      | 1 +
 Gemfile.lock | 4 +++-
 scraper.rb   | 3 +++
 4 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 66d464d5a..2c7b194ce 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 # Ignore output of scraper
 data.sqlite
+.cache
diff --git a/Gemfile b/Gemfile
index 09297b106..71dbac8b5 100644
--- a/Gemfile
+++ b/Gemfile
@@ -10,3 +10,4 @@ ruby "2.3.1"
 gem "scraperwiki", github: "openaustralia/scraperwiki-ruby", branch: "morph_defaults"
 gem "pry"
 gem 'scraped_page', github: 'everypolitician/scraped_page', branch: 'master'
+gem 'open-uri-cached'
diff --git a/Gemfile.lock b/Gemfile.lock
index 045b1e02e..4e948b80d 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -33,6 +33,7 @@ GEM
     mini_portile2 (2.1.0)
     nokogiri (1.6.8.1)
       mini_portile2 (~> 2.1.0)
+    open-uri-cached (0.0.5)
     pry (0.10.3)
       coderay (~> 1.1.0)
       method_source (~> 0.8.1)
@@ -59,6 +60,7 @@ PLATFORMS
   ruby

 DEPENDENCIES
+  open-uri-cached
   pry
   scraped_page!
   scraperwiki!
@@ -67,4 +69,4 @@ RUBY VERSION
    ruby 2.3.1p112

 BUNDLED WITH
-   1.13.6
+   1.13.7
diff --git a/scraper.rb b/scraper.rb
index fe1406276..bf83c4b1a 100644
--- a/scraper.rb
+++ b/scraper.rb
@@ -4,6 +4,9 @@
 require_relative 'lib/members_list_page'
 require_relative 'lib/member_page'

+require 'open-uri/cached'
+OpenURI::Cache.cache_path = '.cache'
+
 url = 'http://www.congreso.es/portal/page/portal/Congreso/Congreso/Diputados/DiputadosTodasLegislaturas'

 loop do
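[Editor's note — not part of the patch series. Assuming scraped_page fetches pages with open-uri, requiring open-uri/cached reroutes repeat fetches through an on-disk cache, which is what makes local development faster:]

    require 'open-uri/cached'
    OpenURI::Cache.cache_path = '.cache'

    # The first fetch hits the network and stores the response under .cache/;
    # re-running the scraper reads the saved copy from disk instead.
    html = open('http://www.congreso.es/portal/page/portal/Congreso/Congreso/Diputados/DiputadosTodasLegislaturas').read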