首页  编辑  

获取HTML页面的文本内容

Tags: /超级猛料/OS.操作系统/IE.扩展/页面控制和交互/   Date Created:

获取HTML页面的文本内容

// A simple way to extract the plain text of an HTML file: load the markup into an in-memory IHTMLDocument2 (MSHTML) and read body.innerText.

uses

 mshtml, ActiveX, ComObj;

{ Button handler: lets the user choose an HTML file, feeds its contents into
  an in-memory MSHTML document and shows the document's plain text
  (body.innerText) in Memo1.

  Sender is the control that fired the event; it is not used.

  Exceptions raised while reading the file or talking to the COM object
  propagate to the caller; the string list is freed in every case. }
procedure TForm1.Button1Click(Sender: TObject);
var
  IDoc: IHTMLDocument2;
  Strl: TStringList;
  sHTMLFile: string;
  v: Variant;
  Body: IHTMLElement;
begin
  // Nothing to do when the user cancels the dialog.
  if not OpenDialog1.Execute then
    Exit;

  sHTMLFile := OpenDialog1.FileName;
  Strl := TStringList.Create;
  try
    Strl.LoadFromFile(sHTMLFile);
    IDoc := CreateComObject(Class_HTMLDocument) as IHTMLDocument2;
    try
      // NOTE(review): design mode is switched on before writing, presumably
      // so the loaded markup is treated as inert content - confirm.
      IDoc.designMode := 'on';
      // Pump the message queue until the document reports it is ready.
      while IDoc.readyState <> 'complete' do
        Application.ProcessMessages;

      // IHTMLDocument2.write expects a SAFEARRAY of variants; wrap the file
      // contents in a one-element variant array and pass its raw pointer.
      v := VarArrayCreate([0, 0], varVariant);
      v[0] := Strl.Text;
      IDoc.write(PSafeArray(System.TVarData(v).VArray));

      IDoc.designMode := 'off';
      while IDoc.readyState <> 'complete' do
        Application.ProcessMessages;

      // body can be nil when no body element exists; dereferencing it
      // directly would raise an access violation.
      Body := IDoc.body;
      if Body <> nil then
        Memo1.Lines.Text := Body.innerText
      else
        Memo1.Lines.Clear;
    finally
      // Release the element first, then the document that owns it.
      Body := nil;
      IDoc := nil;
    end;
  finally
    Strl.Free;
  end;
end;