diff --git a/lib/mindee/input/sources/local_input_source.rb b/lib/mindee/input/sources/local_input_source.rb index 8ebbc57b..a34261bd 100644 --- a/lib/mindee/input/sources/local_input_source.rb +++ b/lib/mindee/input/sources/local_input_source.rb @@ -16,6 +16,7 @@ module Source ALLOWED_MIME_TYPES = [ 'application/pdf', 'image/heic', + 'image/heif', 'image/png', 'image/jpeg', 'image/tiff', @@ -37,11 +38,7 @@ class LocalInputSource def initialize(io_stream, filename, repair_pdf: false) @io_stream = io_stream @filename = filename - @file_mimetype = if repair_pdf - Marcel::MimeType.for @io_stream - else - Marcel::MimeType.for @io_stream, name: @filename - end + @file_mimetype = detect_mime_type(repair_pdf) if ALLOWED_MIME_TYPES.include? @file_mimetype logger.debug("Loaded new input #{@filename} from #{self.class}") return @@ -197,6 +194,28 @@ def source_text? Mindee::PDF::PDFTools.source_text?(@io_stream) end + + private + + # Detects the mimetype of the file, ignoring the filename when repair_pdf is true. + # @param repair_pdf [bool] If true, detect from the stream only; otherwise a .heic/.heif extension takes precedence. + # @return [String] The mimetype of the file. + def detect_mime_type(repair_pdf) + return Marcel::MimeType.for(@io_stream) if repair_pdf + + heif_mimetype_from_extension || Marcel::MimeType.for(@io_stream, name: @filename) + end + + # Checks the file extension for a HEIF mimetype. + # @return [String, nil] The mimetype if found, nil otherwise. + def heif_mimetype_from_extension + case File.extname(@filename.to_s).downcase + when '.heic' + 'image/heic' + when '.heif' + 'image/heif' + end + end end # Replaces non-ASCII characters by their UNICODE escape sequence. diff --git a/sig/mindee/input/sources/local_input_source.rbs b/sig/mindee/input/sources/local_input_source.rbs index 103ae5d8..97764a8f 100644 --- a/sig/mindee/input/sources/local_input_source.rbs +++ b/sig/mindee/input/sources/local_input_source.rbs @@ -23,6 +23,12 @@ module Mindee def write_to_file: (String?) 
-> void def compress!: (?quality: Integer, ?max_width: Integer?, ?max_height: Integer?, ?force_source_text: bool, ?disable_source_text: bool) -> Integer def source_text?: -> bool? + + private + + def detect_mime_type: (bool) -> String + + def heif_mimetype_from_extension: -> String? end def self.convert_to_unicode_escape: (String) -> String end diff --git a/spec/data b/spec/data index 13093f3a..2d7fcf8f 160000 --- a/spec/data +++ b/spec/data @@ -1 +1 @@ -Subproject commit 13093f3a48de212ef26889df71199c1a2a9d1478 +Subproject commit 2d7fcf8f591f6d7f40e39862965325e6a8a21874 diff --git a/spec/input/sources/sources_spec.rb b/spec/input/sources/sources_spec.rb index ea38ff6b..c070985d 100644 --- a/spec/input/sources/sources_spec.rb +++ b/spec/input/sources/sources_spec.rb @@ -54,6 +54,24 @@ expect(input_source.page_count).to eq(1) expect(input_source.pdf?).to eq(false) end + + it 'should load a HEIC from a path', :all_deps do + input_source = Mindee::Input::Source::PathInputSource.new( + File.join(FILE_TYPES_DIR, 'receipt.heif') + ) + expect(input_source.file_mimetype).to eq('image/heif') + expect(input_source.filename).to eq('receipt.heif') + expect(input_source.page_count).to eq(1) + expect(input_source.pdf?).to eq(false) + + input_source2 = Mindee::Input::Source::PathInputSource.new( + File.join(FILE_TYPES_DIR, 'receipt.jpg.heif') + ) + expect(input_source2.file_mimetype).to eq('image/heif') + expect(input_source2.filename).to eq('receipt.jpg.heif') + expect(input_source2.page_count).to eq(1) + expect(input_source2.pdf?).to eq(false) + end end context 'A PDF input file', :all_deps do diff --git a/spec/v2/file_operations/crop_operation_integration.rb b/spec/v2/file_operations/crop_operation_integration.rb index 701d2a69..ab3b9ca9 100644 --- a/spec/v2/file_operations/crop_operation_integration.rb +++ b/spec/v2/file_operations/crop_operation_integration.rb @@ -65,7 +65,7 @@ def check_findoc_return(findoc_response) extracted_images.save_all_to_disk(OUTPUT_DIR) - 
expect(File.size(File.join(OUTPUT_DIR, 'crop_001.jpg'))).to be_between(560_000, 675_000) - expect(File.size(File.join(OUTPUT_DIR, 'crop_002.jpg'))).to be_between(580_000, 680_000) + expect(File.size(File.join(OUTPUT_DIR, 'crop_001.jpg'))).to be_between(560_000, 700_000) + expect(File.size(File.join(OUTPUT_DIR, 'crop_002.jpg'))).to be_between(580_000, 700_000) end end