Read text from a PDF document without using ActiveX in Delphi

// Read text from a PDF document without using ActiveX

{+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

This tip shows how to extract the whole text of a PDF document.

You will need:
– 1 TMemo, 5 TLabel, 1 TButton and 1 OpenDialog
– to import the type library from Adobe Acrobat (look for Acrobat.tlb)

+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++}

unit Unit1;

interface

uses
Windows, Messages, SysUtils, Variants, Classes, Graphics, Controls, Forms,
Dialogs, StdCtrls, OleCtrls, acrobat_tlb;

type
// Main form of the demo. Button1Click lets the user choose a PDF file,
// lists the extracted text in Memo1 and shows statistics in the labels.
// The component fields keep their generated names; presumably they are bound
// to the form resource included via the $R *.dfm directive of this unit.
TForm1 = class(TForm)
// Button whose click handler is presumably Button1Click (wiring lives in the .dfm - confirm)
Button1: TButton;
// Receives one line per extracted text element plus a separator line per page
Memo1: TMemo;
// Used to ask the user for the PDF file to read
OpenDialog1: TOpenDialog;
// Not referenced by the code in this unit
GroupBox1: TGroupBox;
// Label1 and Label3 are not referenced by the code in this unit
Label1: TLabel;
// Shows the number of processed text elements (stichwortcounter)
Label2: TLabel;
Label3: TLabel;
// Shows the accumulated size statistic (Size, or its value in KB)
Label4: TLabel;
// Set to 'KB' when the size statistic exceeds 1024
Label5: TLabel;
// Reads the selected PDF through Acrobat's OLE automation objects
procedure Button1Click(Sender: TObject);
private
{ Private declarations }
public
{ Public declarations }
end;

var
Form1: TForm1;

implementation

uses ComObj;

{$R *.dfm}
{$TYPEDADDRESS OFF} // required: must stay off (original author's note)
// Unit-level work variables shared by the extraction code in Button1Click.
var
// Typed PDDoc interface from the imported type library; not referenced in this unit
PDDoc: Acrobat_TLB.CAcroPDDoc;
// Late-bound page object returned by acrobat.acquirePage
PDPage: Variant;
// Late-bound 'AcroExch.HiliteList' automation object
PDHili: Variant;
// Late-bound text selection returned by PDPage.CreatePageHilite
PDTextS: Variant;
// Late-bound 'AcroExch.pdDoc' automation object (the opened document)
acrobat: Variant;
// Boolean outcome of the last Acrobat call that returns one (Open, HiliteList.Add)
Result: Boolean;
// NTL: number of text elements in the current page's selection;
// i, j and Pagecount are general-purpose integer scratch variables
NTL, i, j, Pagecount: Integer;
// Text of the current element; later reused for the formatted KB value
zeilen: string;
// Number of text elements processed so far
stichwortcounter: Integer;
// Running size statistic shown in Label4
Size: Integer;
// Concatenation of all extracted text elements
gesamtstring: AnsiString;
// Size statistic converted to kilobytes (Size / 1024)
zwreal: Real;

{ Asks the user for a PDF file, extracts its text page by page through
  Acrobat's OLE automation objects (AcroExch.PDDoc / AcroExch.HiliteList),
  lists every text element in Memo1, updates the statistics labels and finally
  saves the memo content to debug.txt in the directory of the executable.

  Sender is the control that triggered the event (unused).

  Automation errors are reported with a message dialog and end the handler
  without writing debug.txt. The document is closed and all automation
  references are released on every exit path. }
procedure TForm1.Button1Click(Sender: TObject);
const
  // Number of text elements requested per page through HiliteList.Add(0, n)
  MaxHiliteElements = 4096;

  { Returns workstring with every CR (#13) and LF (#10) replaced by a blank. }
  function removecrlf(workstring: string): string;
  var
    i: Integer;
  begin
    // Delphi strings are 1-based; index 0 is not a valid character position.
    for i := 1 to Length(workstring) do
      if (workstring[i] = #13) or (workstring[i] = #10) then
        workstring[i] := ' ';

    removecrlf := workstring;
  end;

var
  // Local loop counters: for-loop control variables should be local, and
  // locals cannot be clobbered while Application.ProcessMessages is running.
  pageIdx, textIdx: Integer;
  cleaned: string;
begin
  if not opendialog1.Execute then Exit;

  memo1.Clear;

  gesamtstring := '';
  stichwortcounter := 0;
  Size := 0;

  // Application.ProcessMessages is called inside the loop below; disable the
  // button so a second click cannot re-enter this handler while the shared
  // unit-level variables are in use.
  Button1.Enabled := False;
  try
    try
      // Create the document object
      acrobat := CreateOleObject('AcroExch.pdDoc');

      // Open the PDF file in the document object
      Result := acrobat.Open(opendialog1.FileName);

      if not Result then
      begin
        messagedlg('Kann Datei nicht öffnen', mtWarning, [mbOK], 0);
        Exit;
      end;

      try
        for pageIdx := 0 to acrobat.GetNumPages - 1 do
        begin
          memo1.Lines.Add('———————————————-');
          // Acquire the current page of the document
          PDPage := acrobat.acquirePage(pageIdx);

          // Create a highlight list covering MaxHiliteElements elements
          PDHili := CreateOleObject('AcroExch.HiliteList');
          Result := PDHili.Add(0, MaxHiliteElements);

          // Create a selection over the whole text of the page
          PDTextS := PDPage.CreatePageHilite(PDHili);

          ntl := PDTextS.GetNumText;

          for textIdx := 0 to ntl - 1 do
          begin
            zeilen := PDTextS.GetText(textIdx);
            cleaned := removecrlf(zeilen);
            if cleaned <> '' then
              memo1.Lines.Add(cleaned);
            gesamtstring := gesamtstring + cleaned;
            // Statistics only: count the bytes of the text itself.
            // SizeOf(zeilen) would merely be the size of the string reference.
            Size := Size + Length(zeilen) * SizeOf(Char);
            Inc(stichwortcounter);

            Application.ProcessMessages;
          end;

          // Release the per-page objects again
          pdhili := Unassigned;
          pdtextS := Unassigned;
          pdpage := Unassigned;
          label2.Caption := IntToStr(stichwortcounter);
          label4.Caption := IntToStr(Size);
          label2.Refresh;
          label4.Refresh;
        end; // for each page
      finally
        // Close the document so Acrobat does not keep the file open
        acrobat.Close;
      end;

    except
      on e: Exception do
      begin
        messagedlg('Fehler: ' + e.Message, mtError, [mbOK], 0);
        Exit;
      end;
    end;
  finally
    // Runs on every exit path (including Exit): drop all automation
    // references and make the button usable again.
    pdhili := Unassigned;
    pdtextS := Unassigned;
    pdpage := Unassigned;
    acrobat := Unassigned;
    Button1.Enabled := True;
  end;

  if Size > 1024 then
  begin
    zwreal := Size / 1024;
    str(zwreal: 2: 1, zeilen);
    label4.Caption := zeilen;
    label5.Caption := 'KB';
  end;
  // ExtractFilePath already ends with a path delimiter
  memo1.Lines.SaveToFile(ExtractFilePath(Application.ExeName) + 'debug.txt');
end;

end.