Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds rake task for jhove checkup #1619

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions app/jobs/jhove_checkup_job.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# frozen_string_literal: true
require 'csv'
require 'nokogiri'
require 'open3'
require 'shellwords'

# Runs JHOVE's TIFF-hul module against every `*.tif` file found beneath a
# directory and writes the paths of the files JHOVE flags to
# `config/emory/problem_files.csv` (opened in "w" mode, so each run replaces
# the previous report).
class JhoveCheckupJob < Hyrax::ApplicationJob
  # @param jhove_path [String] command used to launch JHOVE; interpolated into
  #   the shell command as-is
  # @param tifs_path [String] directory searched recursively for `*.tif` files
  def perform(jhove_path, tifs_path)
    # NOTE(review): the glob only matches a lowercase `.tif` extension —
    # confirm that `.TIF` / `.tiff` files are not expected here.
    tif_file_paths = Dir.glob("#{tifs_path}/**/*.tif")
    CSV.open("config/emory/problem_files.csv", "w") do |csv|
      tif_file_paths.each do |file|
        csv << [file] if problem_file?(jhove_path, file)
      end
    end
  end

  private

    # Runs JHOVE on a single file and reports whether its status is flagged.
    #
    # A failed JHOVE run is only logged; whatever output was captured is still
    # parsed, so such a file is listed only if that output carries a flagged
    # status.
    #
    # @param jhove_path [String] command used to launch JHOVE
    # @param file [String] path of the TIFF file to check
    # @return [Boolean] true when the reported status contains "not"
    def problem_file?(jhove_path, file)
      # The file path is shell-escaped so that spaces or shell metacharacters
      # in a file name cannot split or alter the command line.
      command = "#{jhove_path} -m TIFF-hul -h XML #{Shellwords.escape(file)}"
      xml_output, errors, status = Open3.capture3(command)
      Sidekiq.logger.info "There was an error running JHOVE for #{file} #{errors}" unless status&.success?
      document = Nokogiri::XML(xml_output)
      # The status text is lower-cased and searched for "not" because a bad
      # file can be reported in more than one way:
      # Eg: `Not well-formed`
      # Eg: `Well-formed, but not valid`
      document.css('//status').to_xml.downcase.include?("not")
    end
end
11 changes: 11 additions & 0 deletions lib/tasks/jhove_check.rake
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# frozen_string_literal: true
namespace :curate do
  namespace :jhove do
    # Enqueues JhoveCheckupJob for the directory named by the `base_dir`
    # environment variable. `jhove_path` may be set to override the JHOVE
    # command.
    #
    # Example: rake curate:jhove:jhove_check base_dir=/path/to/tifs
    desc "Perform jhove checkup on files in a collection"
    task jhove_check: :environment do
      directory = ENV['base_dir']
      # Without a directory the job would glob "/**/*.tif", i.e. walk the
      # entire filesystem from root, so refuse to enqueue it.
      if directory.to_s.strip.empty?
        abort "base_dir is required, e.g. rake curate:jhove:jhove_check base_dir=/path/to/tifs"
      end
      # NOTE(review): the default is a relative path (no leading slash), so it
      # resolves against the worker's current directory — confirm whether
      # '/opt/jhove/jhove' was intended.
      jhove_path = ENV['jhove_path'] || 'opt/jhove/jhove'
      JhoveCheckupJob.perform_later(jhove_path, directory)
    end
  end
end
19 changes: 19 additions & 0 deletions spec/fixtures/jhove_check_xml/not_well_formatted.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
<?xml version="1.0" encoding="UTF-8"?>
<jhove xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://schema.openpreservation.org/ois/xml/ns/jhove" xsi:schemaLocation="http://schema.openpreservation.org/ois/xml/ns/jhove https://schema.openpreservation.org/ois/xml/xsd/jhove/1.8/jhove.xsd" name="Jhove" release="1.24.1" date="2020-03-16">
<date>2021-06-01T21:36:48-04:00</date>
<repInfo uri="/Users/dmatlaw/Downloads/jhove-1.24.1/jhove-core/src/main/examples/tiff/libtiff_v3/smallliz.tif">
<reportingModule release="1.9.2" date="2019-12-10">TIFF-hul</reportingModule>
<lastModified>2020-02-27T07:06:46-05:00</lastModified>
<size>5052</size>
<format>TIFF</format>
<status>Not well-formed</status>
<sigMatch>
<module>TIFF-hul</module>
</sigMatch>
<messages>
<message severity="info" id="TIFF-HUL-61">TIFF compression scheme 6 is deprecated</message>
<message severity="error" id="TIFF-HUL-7">Type mismatch for tag 532; expecting 5, saw 4</message>
</messages>
<mimeType>image/tiff</mimeType>
</repInfo>
</jhove>
202 changes: 202 additions & 0 deletions spec/fixtures/jhove_check_xml/well_formatted.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
<?xml version="1.0" encoding="UTF-8"?>
<jhove xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://schema.openpreservation.org/ois/xml/ns/jhove" xsi:schemaLocation="http://schema.openpreservation.org/ois/xml/ns/jhove https://schema.openpreservation.org/ois/xml/xsd/jhove/1.8/jhove.xsd" name="Jhove" release="1.24.1" date="2020-03-16">
<date>2021-06-01T21:26:57-04:00</date>
<repInfo uri="/Users/dmatlaw/rails_apps/dlp-curate/spec/fixtures/book_page/0003_preservation_master.tif">
<reportingModule release="1.9.2" date="2019-12-10">TIFF-hul</reportingModule>
<lastModified>2020-11-11T17:15:12-05:00</lastModified>
<size>32204</size>
<format>TIFF</format>
<version>4.0</version>
<status>Well-Formed and valid</status>
<sigMatch>
<module>TIFF-hul</module>
</sigMatch>
<mimeType>image/tiff</mimeType>
<profiles>
<profile>TIFF/IT-BP/P2 (ISO 12639:1998)</profile>
</profiles>
<properties>
<property>
<name>TIFFMetadata</name>
<values arity="Array" type="Property">
<property>
<name>ByteOrder</name>
<values arity="Scalar" type="String">
<value>little-endian</value>
</values>
</property>
<property>
<name>IFDs</name>
<values arity="List" type="Property">
<property>
<name>Number</name>
<values arity="Scalar" type="Integer">
<value>1</value>
</values>
</property>
<property>
<name>IFD</name>
<values arity="Array" type="Property">
<property>
<name>Offset</name>
<values arity="Scalar" type="Long">
<value>32014</value>
</values>
</property>
<property>
<name>Type</name>
<values arity="Scalar" type="String">
<value>TIFF</value>
</values>
</property>
<property>
<name>Entries</name>
<values arity="List" type="Property">
<property>
<name>NisoImageMetadata</name>
<values arity="Scalar" type="NISOImageMetadata">
<value>
<mix:mix xmlns:mix="http://www.loc.gov/mix/v20" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.loc.gov/mix/v20 http://www.loc.gov/standards/mix/mix20/mix20.xsd">
<mix:BasicDigitalObjectInformation>
<mix:ObjectIdentifier>
<mix:objectIdentifierType>JHOVE</mix:objectIdentifierType>
</mix:ObjectIdentifier>
<mix:FormatDesignation>
<mix:formatName>image/tiff</mix:formatName>
</mix:FormatDesignation>
<mix:byteOrder>little endian</mix:byteOrder>
<mix:Compression>
<mix:compressionScheme>Group 4 Fax</mix:compressionScheme>
</mix:Compression>
</mix:BasicDigitalObjectInformation>
<mix:BasicImageInformation>
<mix:BasicImageCharacteristics>
<mix:imageWidth>1800</mix:imageWidth>
<mix:imageHeight>2400</mix:imageHeight>
<mix:PhotometricInterpretation>
<mix:colorSpace>WhiteIsZero</mix:colorSpace>
</mix:PhotometricInterpretation>
</mix:BasicImageCharacteristics>
</mix:BasicImageInformation>
<mix:ImageCaptureMetadata>
<mix:orientation>normal*</mix:orientation>
</mix:ImageCaptureMetadata>
<mix:ImageAssessmentMetadata>
<mix:SpatialMetrics>
<mix:samplingFrequencyUnit>in.</mix:samplingFrequencyUnit>
<mix:xSamplingFrequency>
<mix:numerator>629145600</mix:numerator>
<mix:denominator>2097152</mix:denominator>
</mix:xSamplingFrequency>
<mix:ySamplingFrequency>
<mix:numerator>629145600</mix:numerator>
<mix:denominator>2097152</mix:denominator>
</mix:ySamplingFrequency>
</mix:SpatialMetrics>
<mix:ImageColorEncoding>
<mix:BitsPerSample>
<mix:bitsPerSampleValue>1</mix:bitsPerSampleValue>
<mix:bitsPerSampleUnit>integer</mix:bitsPerSampleUnit>
</mix:BitsPerSample>
<mix:samplesPerPixel>1</mix:samplesPerPixel>
</mix:ImageColorEncoding>
</mix:ImageAssessmentMetadata>
</mix:mix>
</value>
</values>
</property>
<property>
<name>NewSubfileType</name>
<values arity="Scalar" type="Long">
<value>0</value>
</values>
</property>
<property>
<name>SampleFormat</name>
<values arity="Array" type="Integer">
<value>1</value>
</values>
</property>
<property>
<name>MinSampleValue</name>
<values arity="Array" type="Integer">
<value>0</value>
</values>
</property>
<property>
<name>MaxSampleValue</name>
<values arity="Array" type="Integer">
<value>1</value>
</values>
</property>
<property>
<name>Threshholding</name>
<values arity="Scalar" type="Integer">
<value>1</value>
</values>
</property>
<property>
<name>T6Options</name>
<values arity="Scalar" type="Long">
<value>0</value>
</values>
</property>
<property>
<name>TIFFITProperties</name>
<values arity="List" type="Property">
<property>
<name>BackgroundColorIndicator</name>
<values arity="Scalar" type="String">
<value>background not defined</value>
</values>
</property>
<property>
<name>ImageColorIndicator</name>
<values arity="Scalar" type="String">
<value>image not defined</value>
</values>
</property>
<property>
<name>TransparencyIndicator</name>
<values arity="Scalar" type="String">
<value>no transparency</value>
</values>
</property>
<property>
<name>PixelIntensityRange</name>
<values arity="Array" type="Integer">
<value>0</value>
<value>1</value>
</values>
</property>
<property>
<name>RasterPadding</name>
<values arity="Scalar" type="String">
<value>1 byte</value>
</values>
</property>
<property>
<name>BitsPerRunLength</name>
<values arity="Scalar" type="Integer">
<value>8</value>
</values>
</property>
<property>
<name>BitsPerExtendedRunLength</name>
<values arity="Scalar" type="Integer">
<value>16</value>
</values>
</property>
</values>
</property>
</values>
</property>
</values>
</property>
</values>
</property>
</values>
</property>
</properties>
</repInfo>
</jhove>
32 changes: 32 additions & 0 deletions spec/jobs/jhove_checkup_job_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# frozen_string_literal: true
require 'rails_helper'

# Exercises JhoveCheckupJob with the JHOVE invocation and the directory listing
# stubbed out, so neither a JHOVE install nor real TIFF files are needed.
RSpec.describe JhoveCheckupJob, :clean do
  # Raises the Sidekiq log threshold so info-level job output stays out of the test log.
  Sidekiq.logger.level = Logger::WARN
  let(:csv_path) { "config/emory/problem_files.csv" }
  let(:csv) { File.read(csv_path) }
  let(:good_xml) { File.read(File.join(fixture_path, 'jhove_check_xml', 'well_formatted.xml')) }
  let(:bad_xml) { File.read(File.join(fixture_path, 'jhove_check_xml', 'not_well_formatted.xml')) }
  # These paths match the `uri` recorded in the fixtures; they are only used as
  # strings and never read from disk.
  let(:good_file_path) { "/Users/dmatlaw/rails_apps/dlp-curate/spec/fixtures/book_page/0003_preservation_master.tif" }
  let(:bad_file_path) { "/Users/dmatlaw/Downloads/jhove-1.24.1/jhove-core/src/main/examples/tiff/libtiff_v3/smallliz.tif" }
  let(:success_status) { instance_double(Process::Status, success?: true) }

  # The job writes its report to a fixed path, so remove it after each example.
  after do
    File.delete(csv_path) if File.exist?(csv_path)
  end

  context 'with a good and bad file' do
    let(:jhove_command_bad) { "some_jhove_path -m TIFF-hul -h XML #{bad_file_path}" }
    let(:jhove_command_good) { "some_jhove_path -m TIFF-hul -h XML #{good_file_path}" }

    before do
      # Open3.capture3 returns one [stdout, stderr, status] array. Passing the
      # three values as separate arguments to `and_return` would instead make
      # the stub return them one at a time on consecutive calls.
      allow(Open3).to receive(:capture3).with(jhove_command_good).and_return([good_xml, "", success_status])
      allow(Open3).to receive(:capture3).with(jhove_command_bad).and_return([bad_xml, "", success_status])
      # Dir.glob is stubbed, so the directory argument passed below is never inspected.
      allow(Dir).to receive(:glob).and_return([good_file_path, bad_file_path])
      described_class.perform_now("some_jhove_path", "fixture_path")
    end

    it 'adds bad filepath to csv but does not add good file' do
      expect(csv).not_to include(good_file_path)
      expect(csv).to include(bad_file_path)
    end
  end
end