forked from jourdant/powershell-paperless
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathitextsharplib.psm1
43 lines (34 loc) · 1.05 KB
/
itextsharplib.psm1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#
# Title: itextsharplib.psm1
# Author: Jourdan Templeton
# Email: [email protected]
# Modified: 10/01/2015 21:49PM NZDT
#
Add-Type -Path "$PSScriptRoot\Lib\itextsharp.dll"
<#
.SYNOPSIS
This cmdlet loads a PDF file and returns the text content of its first page.
.DESCRIPTION
This cmdlet opens a PDF file with iTextSharp and returns the text extracted from page 1 of the document. NOTE: this only applies to documents that have text embedded. This does not apply to text contained in images of the PDF.
.PARAMETER Path
The path to the PDF file to be processed. Accepts a string from the pipeline, or any pipeline object exposing a FullName property (for example the output of Get-ChildItem). A path that exists relative to the current PowerShell location is resolved to its full provider path before it is handed to iTextSharp.
.OUTPUTS
System.String. The text extracted from the first page of the PDF.
.EXAMPLE
Get-ItsTextFromPdf -Path "C:\temp\test.pdf"
.EXAMPLE
$text = Get-ChildItem "C:\Temp" -Filter *.pdf | Get-ItsTextFromPdf
#>
Function Get-ItsTextFromPdf()
{
    Param(
        # ValueFromPipelineByPropertyName is what makes the FullName alias effective:
        # piped FileInfo objects bind through their FullName property instead of being
        # coerced with ToString(), which may yield only the bare file name.
        [Parameter(Mandatory=$true, ValueFromPipeline=$true, ValueFromPipelineByPropertyName=$true)][Alias("FullName")][String]$Path
    )

    Process {
        # .NET resolves relative paths against the process working directory, not the
        # current PowerShell location, so translate existing paths to a full provider
        # path first. Anything PowerShell cannot resolve is passed through unchanged
        # and left for PdfReader to accept or reject, as before.
        $pdfPath = $Path
        if (Test-Path -LiteralPath $Path) {
            $pdfPath = (Resolve-Path -LiteralPath $Path).ProviderPath
        }

        $reader = $null
        $text = $null
        try {
            #construct reader object and prepare for reading
            $reader = New-Object iTextSharp.text.pdf.PdfReader($pdfPath)

            #read pdf (first page only)
            $text = [iTextSharp.text.pdf.parser.PdfTextExtractor]::GetTextFromPage($reader, 1)
        }
        finally {
            #clean up references, even when opening or extraction throws
            if ($null -ne $reader) {
                $reader.Dispose()
            }
        }

        return $text
    }
}