Re: Ideas: How to rip text from PDF files
- From: "Umer" <umer41@xxxxxxxxxxx>
- Date: Fri, 24 Jun 2005 12:25:47 +0500
You can read a PDF by using a PDF IFilter, although getting it to work
takes a fair amount of effort.
Download the IFilter from
http://www.adobe.com/support/downloads/detail.jsp?ftpID=2611 if you don't
already have it.
The attached code (PDFFileReader.txt) uses COM interop to call the PDF IFilter.
Umer.
"Peter Qian" <pmouse@xxxxxxxxx> wrote in message
news:rojue.1077$ri.920@xxxxxxxxxxxxxxxxxxxxx
> Hi Guys,
> My goal is simple, given a pdf file, encrypted or not, i need to rip
> the
> text out from there. Just plain text, no formatting needed.
> Any idea how may I accomplish this in DotNet?
>
> Regards,
>
> Peter
>
>
begin 666 PDFFileReader.txt
M#0H-"B\O($-/1$4@5$A!5"!54T53(%1(12!"14Q/5R!#3$%34PT*<'5B;&EC
M('-T871I8R!V;VED($UA:6XH*0T*>PT*("!/9F9I8V5&:6QE4F5A9&5R+E!$
M1E)E861E<B!O8FI/1E(@/2!N97<@4$1&4F5A9&5R+E!$1E)E861E<B@I#0H@
M('-T<FEN9R!O=71P=70](B([#0H@(&]B:D]&4BY'971497AT*")#.EQ<37E7
M;W)D1FEL92Y$;V,B+"!R968@;W5T<'5T*3L-"B @0V]N<V]L92Y7<FET94QI
M;F4H;W5T<'5T*3L-"GT-"@T*+R\O/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T]
M/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T-"@T*+R\O
M($]F9FEC92!&:6QE(%)E861E<@T*#0HO+R\]/3T]/3T]/3T]/3T]/3T]/3T]
M/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T]/3T]/0T*
M#0IU<VEN9R!3>7-T96T[#0IU<VEN9R!3>7-T96TN5&5X=#L-"G5S:6YG(%-Y
M<W1E;2Y2=6YT:6UE+DEN=&5R;W!397)V:6-E<SL-"@T*#0IN86UE<W!A8V4@
M4$1&4F5A9&5R#0I[#0H@(" @(W)E9VEO;B!3='5F9B!Y;W4@1&]N="!E=F5N
M(&YE960@=&\@;&]O:R!A= T*(" @(%M&;&%G<UT-"@T*(" @('!U8FQI8R!E
M;G5M($E&24Q415)?24Y)5 T*(" @('L-"B @(" @(" @3D].12 ](# L#0H@
M(" @(" @($-!3D].7U!!4D%'4D%02%,@/2 Q+ T*(" @(" @("!(05)$7TQ)
M3D5?0E)%04M3(#T@,BP-"B @(" @(" @0T%.3TY?2%E02$5.4R ](#0L#0H@
M(" @(" @($-!3D].7U-004-%4R ](#@L#0H@(" @(" @($%04$Q97TE.1$58
M7T%45%))0E5415,@/2 Q-BP-"B @(" @(" @05!03%E?0U)!5TQ?05144DE"
M551%4R ](#(U-BP-"B @(" @(" @05!03%E?3U1(15)?05144DE"551%4R ]
M(#,R+ T*(" @(" @("!)3D1%6$E.1U]/3DQ9(#T@-C0L#0H@(" @(" @(%-%
M05)#2%],24Y+4R ](#$R."P-"B @(" @(" @1DE,5$527T]73D5$7U9!3%5%
M7T]+(#T@-3$R#0H@(" @?0T*#0H-"@T*(" @(%M&;&%G<UT-"B @("!P=6)L
M:6,@96YU;2!)1DE,5$527T9,04=3#0H@(" @>PT*(" @(" @("!/3$5?4%)/
M4$525$E%4R ](#$-"B @("!]#0H-"@T*#0H@(" @<'5B;&EC(&5N=6T@0TA5
M3DM?0E)%04M465!%#0H@(" @>PT*(" @(" @("!#2%5.2U].3U]"4D5!2R ]
M(# L#0H@(" @(" @($-(54Y+7T5/5R ](#$L#0H@(" @(" @($-(54Y+7T5/
M4R ](#(L#0H@(" @(" @($-(54Y+7T5/4" ](#,L#0H@(" @(" @($-(54Y+
M7T5/0R ](#0-"B @("!]#0H-"@T*#0H@(" @6T9L86=S70T*(" @('!U8FQI
M8R!E;G5M($-(54Y+4U1!5$4-"B @("![#0H@(" @(" @($-(54Y+7U1%6%0@
M/2 P>#$L#0H@(" @(" @($-(54Y+7U9!3%5%(#T@,'@R+ T*(" @(" @("!#
M2%5.2U]&24Q415)?3U=.141?5D%,544@/2 P>#0-"B @("!]#0H-"@T*#0H@
M(" @<'5B;&EC(&5N=6T@4%-+24Y$#0H@(" @>PT*(" @(" @("!,4%=35%(@
M/2 P+ T*(" @(" @("!04D]0240@/2 Q#0H@(" @?0T*#0H-"@T*(" @(%M3
M=')U8W1,87EO=70H3&%Y;W5T2VEN9"Y397%U96YT:6%L*5T-"B @("!P=6)L
M:6,@<W1R=6-T(%!23U!34$5##0H@(" @>PT*(" @(" @("!P=6)L:6,@=6EN
M="!U;$MI;F0[#0H@(" @(" @('!U8FQI8R!U:6YT('!R;W!I9#L-"B @(" @
M(" @<'5B;&EC($EN=%!T<B!L<'=S='([#0H@(" @?0T*#0H-"@T*(" @(%M3
M=')U8W1,87EO=70H3&%Y;W5T2VEN9"Y397%U96YT:6%L*5T-"@T*(" @('!U
M8FQI8R!S=')U8W0@1E5,3%!23U!34$5##0H@(" @>PT*(" @(" @("!P=6)L
M:6,@1W5I9"!G=6ED4')O<%-E=#L-"B @(" @(" @<'5B;&EC(%!23U!34$5#
M('!S4')O<&5R='D[#0H@(" @?0T*#0H-"@T*(" @(%M3=')U8W1,87EO=70H
M3&%Y;W5T2VEN9"Y397%U96YT:6%L*5T-"B @("!P=6)L:6,@<W1R=6-T(%-4
M051?0TA53DL-"B @("![#0H@(" @(" @('!U8FQI8R!U:6YT(&ED0VAU;FL[
M#0H-"B @(" @(" @6TUA<G-H86Q!<RA5;FUA;F%G9614>7!E+E4T*5T-"B @
M(" @(" @<'5B;&EC($-(54Y+7T)214%+5%E012!B<F5A:U1Y<&4[#0H-"B @
M(" @(" @6TUA<G-H86Q!<RA5;FUA;F%G9614>7!E+E4T*5T-"B @(" @(" @
M<'5B;&EC($-(54Y+4U1!5$4@9FQA9W,[#0H-"B @(" @(" @<'5B;&EC('5I
M;G0@;&]C86QE.PT*#0H@(" @(" @(%M-87)S:&%L07,H56YM86YA9V5D5'EP
M92Y3=')U8W0I70T*(" @(" @("!P=6)L:6,@1E5,3%!23U!34$5#(&%T=')I
M8G5T93L-"@T*(" @(" @("!P=6)L:6,@=6EN="!I9$-H=6YK4V]U<F-E.PT*
M#0H@(" @(" @('!U8FQI8R!U:6YT(&-W8U-T87)T4V]U<F-E.PT*#0H@(" @
M(" @('!U8FQI8R!U:6YT(&-W8TQE;E-O=7)C93L-"B @("!]#0H-"@T*#0H@
M(" @6U-T<G5C=$QA>6]U="A,87EO=71+:6YD+E-E<75E;G1I86PI70T*#0H@
M(" @<'5B;&EC('-T<G5C="!&24Q415)214=)3TX-"B @("![#0H@(" @(" @
M('!U8FQI8R!U:6YT(&ED0VAU;FL[#0H@(" @(" @('!U8FQI8R!U:6YT(&-W
M8U-T87)T.PT*(" @(" @("!P=6)L:6,@=6EN="!C=V-%>'1E;G0[#0H@(" @
M?0T*#0H-"B @(" C96YD<F5G:6]N#0H-"B @("!;0V]M26UP;W)T70T*#0H@
M(" @6T=U:60H(C@Y0D-"-S0P+38Q,3DM,3 Q02U"0T(W+3 P1$0P,3 V-35!
M1B(I70T*(" @(%M);G1E<F9A8V54>7!E*$-O;4EN=&5R9F%C951Y<&4N26YT
M97)F86-E27-)56YK;F]W;BE=#0H@(" @<'5B;&EC(&EN=&5R9F%C92!)1FEL
M=&5R#0H@(" @>PT*(" @(" @("!V;VED($EN:70H6TUA<G-H86Q!<RA5;FUA
M;F%G9614>7!E+E4T*5T@249)3%1%4E])3DE4(&=R9D9L86=S+ T*(" @(" @
M(" @(" @(" @(" @=6EN="!C071T<FEB=71E<RP-"B @(" @(" @(" @(" @
M(" @(%M-87)S:&%L07,H56YM86YA9V5D5'EP92Y,4$%R<F%Y+"!3:7IE4&%R
M86U);F1E>" ](#$I72!&54Q,4%)/4%-014-;72!A071T<FEB=71E<RP-"B @
M(" @(" @(" @(" @(" @(')E9B!U:6YT('!D=T9L86=S*3L-"@T*(" @(" @
M("!V;VED($=E=$-H=6YK*%M-87)S:&%L07,H56YM86YA9V5D5'EP92Y3=')U
M8W0I72!O=70@4U1!5%]#2%5.2R!P4W1A="D[#0H-"B @(" @(" @6U!R97-E
M<G9E4VEG70T*(" @(" @("!I;G0@1V5T5&5X="AR968@=6EN="!P8W=C0G5F
M9F5R+"!;36%R<VAA;$%S*%5N;6%N86=E9%1Y<&4N3%!74W1R*5T@4W1R:6YG
M0G5I;&1E<B!B=69F97(I.PT*#0H@(" @(" @('9O:60@1V5T5F%L=64H<F5F
M(%5);G10='(@<'!0<F]P5F%L=64I.PT*#0H@(" @(" @('9O:60@0FEN9%)E
M9VEO;BA;36%R<VAA;$%S*%5N;6%N86=E9%1Y<&4N4W1R=6-T*5U&24Q415)2
M14=)3TX@;W)I9U!O<RP@<F5F($=U:60@<FEI9"P@<F5F(%5);G10='(@<'!U
M;FLI.PT*#0H@(" @?0T*#0H-"@T*(" @(%M#;VU);7!O<G1=#0H@(" @+R\@
M1F]R(')E861I;F<@;V9F:6-E(&9I;&5S(&QI:V4@+F1O8R N>&QS(&%N9" N
M<'!T6T=U:60H(F8P-V8S.3(P+3=B.&,M,3%C9BTY8F4X+3 P86$P,#1B.3DX
M-B(I70T*(" @(%M'=6ED*"(T0SDP-#0T."TW-$$Y+3$Q9# M048V12TP,$,P
M-$9$.$1#,#(B*5T-"B @("!P=6)L:6,@8VQA<W,@0T9I;'1E<@T*(" @('L-
M"@T*(" @('T-"@T*#0H-"@T*#0H@(" @<'5B;&EC(&-L87-S($-O;G-T86YT
M<PT*(" @('L-"@T*(" @(" @("!P=6)L:6,@8V]N<W0@=6EN="!0241?4U1'
M7T1)4D5#5$]262 ](#!X,# P,# P,#([#0H@(" @(" @('!U8FQI8R!C;VYS
M="!U:6YT(%!)1%]35$=?0TQ!4U-)1" ](#!X,# P,# P,#,[#0H@(" @(" @
M('!U8FQI8R!C;VYS="!U:6YT(%!)1%]35$=?4U1/4D%'15194$4@/2 P># P
M,# P,# T.PT*#0H@(" @(" @('!U8FQI8R!C;VYS="!U:6YT(%!)1%]35$=?
M5D],54U%7TE$(#T@,'@P,# P,# P-3L-"B @(" @(" @<'5B;&EC(&-O;G-T
M('5I;G0@4$E$7U-41U]005)%3E1?5T]22TE$(#T@,'@P,# P,# P-CL-"B @
M(" @(" @<'5B;&EC(&-O;G-T('5I;G0@4$E$7U-41U]314-/3D1!4EE35$]2
M12 ](#!X,# P,# P,#<[#0H-"B @(" @(" @<'5B;&EC(&-O;G-T('5I;G0@
M4$E$7U-41U]&24Q%24Y$15@@/2 P># P,# P,# X.PT*(" @(" @("!P=6)L
M:6,@8V]N<W0@=6EN="!0241?4U1'7TQ!4U1#2$%.1T554TX@/2 P># P,# P
M,# Y.PT*(" @(" @("!P=6)L:6,@8V]N<W0@=6EN="!0241?4U1'7TY!344@
M/2 P># P,# P,#!A.PT*(" @(" @("!P=6)L:6,@8V]N<W0@=6EN="!0241?
M4U1'7U!!5$@@/2 P># P,# P,#!B.PT*#0H@(" @(" @('!U8FQI8R!C;VYS
M="!U:6YT(%!)1%]35$=?4TE:12 ](#!X,# P,# P,&,[#0H@(" @(" @('!U
M8FQI8R!C;VYS="!U:6YT(%!)1%]35$=?05144DE"551%4R ](#!X,# P,# P
M,&0[#0H@(" @(" @('!U8FQI8R!C;VYS="!U:6YT(%!)1%]35$=?5U))5$54
M24U%(#T@,'@P,# P,# P93L-"B @(" @(" @<'5B;&EC(&-O;G-T('5I;G0@
M4$E$7U-41U]#4D5!5$5424U%(#T@,'@P,# P,# P9CL-"B @(" @(" @<'5B
M;&EC(&-O;G-T('5I;G0@4$E$7U-41U]!0T-%4U-424U%(#T@,'@P,# P,# Q
M,#L-"B @(" @(" @<'5B;&EC(&-O;G-T('5I;G0@4$E$7U-41U]#2$%.1T54
M24U%(#T@,'@P,# P,# Q,3L-"@T*(" @(" @("!P=6)L:6,@8V]N<W0@=6EN
M="!0241?4U1'7T-/3E1%3E13(#T@,'@P,# P,# Q,SL-"B @(" @(" @<'5B
M;&EC(&-O;G-T('5I;G0@4$E$7U-41U]32$]25$Y!344@/2 P># P,# P,#$T
M.PT*#0H@(" @(" @('!U8FQI8R!C;VYS="!I;G0@1DE,5$527T5?14Y$7T]&
M7T-(54Y+4R ]("AU;F-H96-K960H*&EN="DP>#@P,#0Q-S P*2D[#0H@(" @
M(" @('!U8FQI8R!C;VYS="!I;G0@1DE,5$527T5?3D]?34]215]415A4(#T@
M*'5N8VAE8VME9"@H:6YT*3!X.# P-#$W,#$I*3L-"B @(" @(" @<'5B;&EC
M(&-O;G-T(&EN="!&24Q415)?15].3U]-3U)%7U9!3%5%4R ]("AU;F-H96-K
M960H*&EN="DP>#@P,#0Q-S R*2D[#0H-"@T*(" @(" @("!P=6)L:6,@8V]N
M<W0@:6YT($9)3%1%4E]%7TY/7U1%6%0@/2 H=6YC:&5C:V5D*"AI;G0I,'@X
M,# T,3<P-2DI.PT*(" @(" @("!P=6)L:6,@8V]N<W0@:6YT($9)3%1%4E]%
M7TY/7U9!3%5%4R ]("AU;F-H96-K960H*&EN="DP>#@P,#0Q-S V*2D[#0H@
M(" @(" @('!U8FQI8R!C;VYS="!I;G0@1DE,5$527U-?3$%35%]415A4(#T@
M*'5N8VAE8VME9"@H:6YT*3!X,# P-#$W,#DI*3L-"B @("!]#0H@(" @<'5B
M;&EC(&-L87-S(%!$1E)E861E<@T*(" @('L@#0H@(" @(" @('!U8FQI8R!V
M;VED($=E=%1E>'0H4W1R:6YG('!A=&@L<F5F('-T<FEN9R!T97AT*0T*(" @
M(" @(" @(" @+R\@<&%T:"!I<R!T:&4@<&%T:"!O9B!T:&4@+G!D9B @9FEL
M90T*(" @(" @(" @(" @+R\@=&5X="!I<R!T:&4@=F%R:6%B;&4@:6X@=VAI
M8V@@86QL('1H92!E>'1R86-T960@=&5X="!W:6QL(&)E('-T;W)E9 T*(" @
M(" @("![#0H@(" @(" @(" @("!3=')I;F<@<F5S=6QT(#T@(B([#0H@(" @
M(" @(" @("!I;G0@8V]U;G0@/2 P.PT*(" @(" @(" @(" @=')Y#0H@(" @
M(" @(" @("
M1FEL=&5R*2AN97<@0T9I;'1E<B@I*3L-"B @(" @(" @(" @(" @(" O+U-Y
M<W1E;2Y2=6YT:6UE+DEN=&5R;W!397)V:6-E<RY50T]-25!E<G-I<W1&:6QE
M(&EP9B ]("A3>7-T96TN4G5N=&EM92Y);G1E<F]P4V5R=FEC97,N54-/34E0
M97)S:7-T1FEL92DH:69I;'0I.PT*(" @(" @(" @(" @(" @(%-Y<W1E;2Y2
M=6YT:6UE+DEN=&5R;W!397)V:6-E<RY#;VU4>7!E<RY)4&5R<VES=$9I;&4@
M:7!F/2 H4WES=&5M+E)U;G1I;64N26YT97)O<%-E<G9I8V5S+D-O;51Y<&5S
M+DE097)S:7-T1FEL92DH:69I;'0I.PT*(" @(" @(" @(" @(" @(&EP9BY,
M;V%D*$!P871H+" P*3L-"B @(" @(" @(" @(" @("!U:6YT(&D@/2 P.PT*
M(" @(" @(" @(" @(" @(%-4051?0TA53DL@<',@/2!N97<@4U1!5%]#2%5.
M2R@xxxx*(" @(" @(" @(" @(" @(&EF:6QT+DEN:70H249)3%1%4E])3DE4
M+DY/3D4L(# L(&YU;&PL(')E9B!I*3L-"B @(" @(" @(" @(" @("!I;G0@
M:'(@/2 P.PT*(" @(" @(" @(" @(" @#0H@(" @(" @(" @(" @(" @=VAI
M;&4@*&AR(#T](# I#0H@(" @(" @(" @(" @(" @>PT*(" @(" @(" @(" @
M(" @(" @( T*(" @(" @(" @(" @(" @(" @(" @(" @:69I;'0N1V5T0VAU
M;FLH;W5T('!S*3L-"B @(" @(" @(" @(" @(" @(" @(" @(&EF("AP<RYF
M;&%G<R ]/2!#2%5.2U-4051%+D-(54Y+7U1%6%0I#0H@(" @(" @(" @(" @
M(" @(" @(" @("?4U],05-47U1%6%0@
M?'P@:'(R(#T](# I#0H@(" @(" @(" @(" @(" @(" @(" @(" @(" @>PT*
M(" @(" @(" @(" @(" @(" @(" @(" @(" @(" @("!T<GD-"B @(" @(" @
M(" @(" @(" @(" @(" @(" @(" @(" @>PT*(" @(" @(" @(" @(" @(" @
M(" @(" @(" @(" @(" @(" @<&-W8T)U9F9E<B ](#$P,# [#0H@(" @(" @
M(" @(" @(" @(" @(" @(" @(" @(" @(" @("!3>7-T96TN5&5X="Y3=')I
M;F="=6EL9&5R('-B0G5F9F5R(#T@;F5W(%-T<FEN9T)U:6QD97(H*&EN="EP
M8W=C0G5F9F5R*3L-"B @(" @(" @(" @(" @(" @(" @(" @(" @(" @(" @
M(" @(&AR,B ](&EF:6QT+D=E=%1E>'0H<F5F('!C=V-"=69F97(L('-B0G5F
M9F5R*3L-"B @(" @(" @(" @(" @(" @(" @(" @(" @(" @(" @(" @("\O
M($-O;G-O;&4N5W)I=&5,:6YE*'!C=V-"=69F97(N5&]3=')I;F<H*2D[#0H@
M(" @(" @(" @(" @(" @(" @(" @(" @(" @(" @(" @("!I9B H:'(R(#X]
M(# I(')E<W5L=" K/2!S8D)U9F9E<BY4;U-T<FEN9R@P+" H:6YT*7!C=V-"
M=69F97(I.PT*(" @(" @(" @(" @(" @(" @(" @(" @(" @(" @(" @(" @
M+R]T97AT0F]X,2Y497AT("L](EQN(CL-"B @(" @(" @(" @(" @(" @(" @
M(" @(" @(" @(" @(" @("\O(')E<W5L=" K/2 B(R,C(R,C(R,C(R,C(R,C
M(R,C(R,C(R,C(R,C(R,C(R,C(R,C(R,C(R,B.PT*(" @(" @(" @(" @(" @
M(" @(" @(" @(" @(" @(" @(" @8V]U;G0K*SL-"B @(" @(" @(" @(" @
M(" @(" @(" @(" @(" @(" @?0T*(" @(" @(" @(" @(" @(" @(" @(" @
M(" @(" @("!C871C:" H4WES=&5M+E)U;G1I;64N26YT97)O<%-E<G9I8V5S
M+D-/345X8V5P=&EO;B!M>44I#0H@(" @(" @(" @(" @(" @(" @(" @(" @
M(" @(" @('L-"B @(" @(" @(" @(" @(" @(" @(" @(" @(" @(" @(" @
M($-O;G-O;&4N5W)I=&5,:6YE*&UY12Y$871A("L@(EQN(B K(&UY12Y-97-S
M86=E("L@(EQN(BD[#0H-"B @(" @(" @(" @(" @(" @(" @(" @(" @(" @
M(" @?0T*(" @(" @(" @(" @(" @(" @(" @(" @(" @('T-"B @(" @(" @
M(" @(" @(" @(" @(" @('T-"B @(" @(" @(" @(" @(" @(" -"B @(" @
M(" @(" @(" @("!]#0H@(" @(" @(" @(" @(" -"B @(" @(" @(" @('T-
M"B @(" @(" @(" @(&-A=&-H("A3>7-T96TN4G5N=&EM92Y);G1E<F]P4V5R
M=FEC97,N0T]-17AC97!T:6]N(&UY12D-"B @(" @(" @(" @('L-"B @(" @
M(" @(" @(" @("!#;VYS;VQE+E=R:71E3&EN92AM>44N1&%T82 K(")<;B(@
M*R!M>44N365S<V%G92 K(")<;B(I.PT*#0H@(" @(" @(" @("!]#0H-"B @
M(" @(" @(" @('1E>'0@/2!R97-U;'0[#0H@(" @(" @(" @(" O+W)E='5R
M;B!C;W5N=#L-"B @(')E='5R;CL-"@T*(" @(" @("!]#0H@(" @?0T*#0I]
"#0H`
`
end
.
- References:
- Ideas: How to rip text from PDF files
- From: Peter Qian
- Ideas: How to rip text from PDF files
- Prev by Date: Re: How to use interfaces properly...
- Next by Date: Using parameters in a SQL OleDb provider leads to "must declare variable error"
- Previous by thread: Ideas: How to rip text from PDF files
- Next by thread: Revert to v1.1 for an individual third party assembly
- Index(es):