1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
use std::borrow::Cow;
use std::convert::TryFrom;
use std::ffi::OsString;
use std::fmt;
use std::sync::Arc;
use grep::printer::{ColorSpecs, StandardBuilder};
use grep::regex::RegexMatcherBuilder;
use grep::searcher::SearcherBuilder;
use poppler::PopplerDocument;
use rayon::iter::{IntoParallelIterator, ParallelIterator};
use skim::{AnsiString, DisplayContext, ItemPreview, PreviewContext, SkimItem};
use termcolor::Ansi;
/// The text extracted from a single PDF file, together with the path it was read from.
#[derive(Debug)]
pub struct PDFContent {
    /// Path of the PDF file this content was extracted from.
    pub file_path: OsString,
    /// Text extracted from the document.
    pub content: String,
}
impl SkimItem for PDFContent {
fn display(&self, _: DisplayContext) -> AnsiString {
self.file_path.as_os_str().to_str().unwrap().into()
}
fn text(&self) -> Cow<str> {
Cow::Borrowed(&self.content)
}
fn preview(&self, context: PreviewContext) -> ItemPreview {
let matcher = RegexMatcherBuilder::new()
.case_smart(true)
.build(context.query)
.unwrap();
let width = context.width as u64;
let mut printer = StandardBuilder::new()
.stats(false)
.color_specs(ColorSpecs::default_with_color())
.max_columns(Some(width))
.max_columns_preview(true)
.build(Ansi::new(vec![]));
let context = crate::CONFIG.read().unwrap().context;
let mut searcher = SearcherBuilder::new()
.line_number(false)
.after_context(context)
.before_context(context)
.build();
let _ = searcher.search_slice(&matcher, self.content.as_bytes(), printer.sink(&matcher));
ItemPreview::AnsiText(String::from_utf8(printer.into_inner().into_inner()).unwrap())
}
}
impl TryFrom<OsString> for PDFContent {
type Error = (ParsingError, OsString);
fn try_from(file_path: OsString) -> Result<Self, Self::Error> {
let document = match PopplerDocument::from_file(&file_path, "") {
Ok(pdf_doc) => pdf_doc,
Err(_) => return Err((ParsingError::NotAPDF, file_path)),
};
let num_pages = document.n_pages();
let max_num_pages = crate::CONFIG.read().unwrap().max_pages;
if max_num_pages != 0 && max_num_pages < num_pages {
return Err((ParsingError::TooManyPages, file_path));
}
let document_arc = Arc::new(document);
let content: String = (0..num_pages)
.into_par_iter()
.map(|page_idx| {
Arc::clone(&document_arc)
.page(page_idx)
.map(|page| page.owned_text())
.flatten()
.unwrap_or_default()
})
.collect();
if content.chars().all(|ch| ch.is_whitespace()) {
Err((ParsingError::EmptyFile, file_path))
} else {
Ok(PDFContent { file_path, content })
}
}
}
/// Reasons a file is rejected when converting it into a [`PDFContent`].
pub enum ParsingError {
    /// The file could not be opened as a PDF document.
    NotAPDF,
    /// No text other than whitespace could be extracted from the document.
    EmptyFile,
    /// The document has more pages than the configured limit allows.
    TooManyPages,
}

/// Human-readable description of the error, suitable for showing to the user.
impl fmt::Display for ParsingError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let message = match self {
            ParsingError::NotAPDF => "file couldn't be read as a pdf",
            ParsingError::EmptyFile => "no text could be recognized from this file",
            ParsingError::TooManyPages => "has too many pages",
        };
        f.write_str(message)
    }
}

/// Prints the same text as [`fmt::Display`], so code that formats the error with
/// `{:?}` keeps producing the human-readable message.
impl fmt::Debug for ParsingError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        fmt::Display::fmt(self, f)
    }
}

/// Allows the error to be used with `?`, `Box<dyn Error>` and error-handling crates.
impl std::error::Error for ParsingError {}