2004. november 30., kedd

Downloading a URL’s HTML


Problem/Question/Abstract:

Downloading a URL’s HTML

Answer:

The objects I present in this article allow you to download data from any URL using the GET method, using only the standard socket components included with Delphi 4+. The object (TabHTTPRequest) is capable of connecting directly to a web server and then requesting a file, the object can also pass a query string; as of this writing it can only get a file using the GET method and using a query string. If there is sufficient interest I can expand the object to also handle POST and cookies, as well as interpreting the result so the return header can be used, so let me know what you guys think!

TInternetURI – This object takes a URI (uniform resource indicator) and splits it into it’s various components to allow the GET object to accept a URL such as http://www.borland.com/delphi/ as a parameter. You do not need to use this object directly.  This object however, follows the complete RFC standard for HTTP addresses and can be used to interpret any URL into its various components.

TabHTTPRequest – This object is designed to connect to a web server and download the HTML, which can then be used in your application.

A couple examples:

URL:

http://www.borland.com/delphi/

CODE:

with TabHTTPRequest.Create do
begin
  Get('http://www.borland.com/delphi/');
  // Work with result (ex. mmURL.Text := ResultData.DataString);
  Free;
end; // with

URL:

http://www.borland.com/rad/delandcppletter.html

CODE:

with TabHTTPRequest.Create do
begin
  Get('http://www.borland.com/rad/delandcppletter.html’);
    // Work with result (ex. mmURL.Text := ResultData.DataString);
    Free;
end; // with

URL: (This is an actual search on yahoo)

http://google.yahoo.com/bin/query?p=delphi+3000&hc=0&hs=0

CODE:

with TabHTTPRequest.Create do
begin
  Get('http://google.yahoo.com/bin/query?p=delphi+3000&hc=0&hs=0');
  // Work with result (ex. mmURL.Text := ResultData.DataString);
  Free;
end; // with

Once get has been called you can access the HTML through the ResultData property:

mmHTML.Lines.Text := URLObject.ResultData.DataString;

I hope you found this article and function to be useful; I’d love to hear your comments, suggestions, etc.

The following is the source code for the functions described above, feel free to use the code in your own programs, but please leave my name and address intact!

I also have a complete test program available by request via e-mail.

// ---------------------------ooo------------------------------ \\
// ©2000 David Lederman
// dlederman@ssccompany.com
// ---------------------------ooo------------------------------ \\
unit abHTTPGet;

interface

uses
  Classes, Sysutils, ScktComp;

// ---------------------------ooo------------------------------ \\
// This type will crack a Uniform Resource Indicator
// ---------------------------ooo------------------------------ \\
type
  TInternetURI = class(TObject)
  private
    function CrackScheme(var URIData: string): string;
    function CrackLocation(var URIData: string): string;
    function CrackQuery(var URIData: string): string;
    function CrackParams(var URIData: string): string;
  public
    Scheme: string;
    NetLocation: string;
    Path: string;
    Query: string;
    Fragment: string;
    Params: string;
    constructor Create(URIData: string);
    destructor Destroy; override;
  end;

type
  TabHTTPRequest = class
  private
    iBuffer: string;
    Socket: TClientSocket;
  public
    ResultData: TStringStream;
    HostToConnect: string;
    PortToConnect: Integer;
    FileToGet: string;
    TimeOut: Integer;
    function Get: Boolean; overload;
    function Get(URL: string): Boolean; overload;
    constructor Create;
    destructor Destroy; override;
  end;

  // ---------------------------ooo------------------------------ \\
  // Global HTTP Routines
  // ---------------------------ooo------------------------------ \\
function TrimToToken(Token: Char; var DataToParse: string; CopyToken: Boolean = True;
  MaxCount: Integer = 1): string;
function TrimPastToken(Token: Char; var DataToParse: string; CopyToken: Boolean =
  True; MaxCount: Integer = 1): string;

implementation

{ TabHTTPRequest }

constructor TabHTTPRequest.Create;
begin
  // Simply Set Defaults
  HostToConnect := 'www.InternetToolsCorp.com';
  PortToConnect := 80;
  FileToGet := '/';
  TimeOut := 5000;
  // Create the socket object
  Socket := TClientSocket.Create(nil);
  Socket.ClientType := ctBlocking;
  // Create the result stream
  ResultData := TStringStream.Create('');
end;

destructor TabHTTPRequest.Destroy;
begin
  // Free the helper objects
  Socket.Free;
  ResultData.Free;
  inherited;
end;

function TabHTTPRequest.Get: Boolean;
var
  Waiter: TWinSocketStream;
  BufferData: array[0..4028] of char;
  DataRead: Integer;
  BufferString: string;
begin
  // Setup the Request
  Waiter := nil;
  iBuffer := '';
  Socket.Host := HostToConnect;
  Socket.Port := PortToConnect;
  // Reset the data stream
  ResultData.Size := 0;
  try
    // Do the request
    // Open the connection
  //  Socket.Open;
    Socket.Open;
    // Create the waiter
    Waiter := TWinSocketStream.Create(Socket.Socket, TimeOut);
    // Prepare the request
    BufferString := 'GET ' + FileToGet + ' HTTP/1.1' + #13#10 + 'Host: ' +
      HostToConnect + #13#10 + #13#10;
    // Write the Request
    Waiter.Write(BufferString[1], Length(BufferString));
    Waiter.Free;
    Waiter := nil;
    // Now process the result of the request
    while Socket.Socket.Connected do
    begin
      try
        // Create the waiter
        Waiter := TWinSocketStream.Create(Socket.Socket, TimeOut);
        // Wait for data
        if Waiter.WaitForData(TimeOut) then
        begin
          // Try to read a chunck of data
          DataRead := Waiter.Read(BufferData, SizeOf(BufferData));
          // Check if we got data
          if DataRead = 0 then
          begin
            // Get out
            Socket.Close;
          end
          else
          begin
            // Save the data to the stream
            ResultData.Write(BufferData, DataRead);
          end;
        end
        else
        begin
          Socket.Close;
        end;
      finally
        Waiter.Free;
        Waiter := nil;
      end;
    end;
    // close the socket
    if Socket.Active then
      Socket.Close;
    Result := True;
    // Clean up
    if Waiter <> nil then
      Waiter.Free;
  except
    // Free the waiter object
    if Waiter <> nil then
      Waiter.Free;
    // Close the socket if it's open
    if Socket.Active then
      Socket.Close;
    // reraise the exception
    raise;
  end;
end;

function TabHTTPRequest.Get(URL: string): Boolean;
begin
  // Crack the URL
  try
    // Make sure than a scheme is in place
    if Pos('://', URL) = 0 then
    begin
      // Simply Prepend the HTTP
      URL := 'http://' + URL;
    end;
    // Make sure that a / is in the URL
    if Pos('/', Copy(URL, 8, Length(URL))) = 0 then
    begin
      // Simply Append the trailing /
      URL := URL + '/';
    end;

    with TInternetURI.Create(URL) do
    begin
      // Check if there is a port in the net location
      if Pos(':', NetLocation) <> 0 then
      begin
        // Copy the host name
        HostToConnect := Copy(NetLocation, 1, Pos(':', NetLocation) - 1);
        // Copy the port
        PortToConnect := StrToInt(Copy(NetLocation, Pos(':', NetLocation) + 1,
          Length(NetLocation)));
      end
      else
      begin
        HostToConnect := NetLocation;
        PortToConnect := 80;
      end;
      FileToGet := '';
      // Set the File to get
      if Query <> '' then
        FileToGet := Path + '?' + Query;
      if FileToGet = '' then
        FileToGet := '/';
      Free
    end; // with
    // Now simply call get
    Result := Get;
  except
    raise;
  end;
end;

{ TInternetURI }

function TInternetURI.CrackLocation(var URIData: string): string;
var
  StartPos, EndPos: Integer;
begin
  // Step 1. - See if the network ID is here
  StartPos := Pos('//', URIData);
  // If the starting // is not found then there is no network location
  if StartPos = 0 then
    Exit;
  // Delete the first //
  Delete(URIData, StartPos, 2);
  // Now look for the trailing slash
  EndPos := Pos('/', URIData);
  if (EndPos = 0) or (EndPos = 1) then
    Exit;
  // Now Copy the String Upto the /
  Result := Copy(URIData, 1, EndPos - 1);
  // Now Delete the network location
  Delete(URIData, 1, EndPos - 1);
end;

function TInternetURI.CrackParams(var URIData: string): string;
var
  StartPos: Integer;
begin
  // Step 1. - See if the query is here
  StartPos := Pos(';', URIData);
  // If the starting ; is not found then there are no params
  if StartPos = 0 then
    Exit;
  // Copy the Params String
  Result := Copy(URIData, StartPos + 1, Length(URIData));
  Delete(URIData, StartPos, Length(URIData));
end;

function TInternetURI.CrackQuery(var URIData: string): string;
var
  StartPos: Integer;
begin
  // Step 1. - See if the query is here
  StartPos := Pos('?', URIData);
  // If the starting ? is not found then there is no query
  if StartPos = 0 then
    Exit;
  // Copy the Query String
  Result := Copy(URIData, StartPos + 1, Length(URIData));
  Delete(URIData, StartPos, Length(URIData));
end;

function TInternetURI.CrackScheme(var URIData: string): string;
const
  AllowedChars = ['A'..'Z', 'a'..'z', '0'..'9', '+', '-', '.'];
var
  tString, WorkData: string;
  i: Integer;
  StringLength: Integer;
  InValidScheme: Boolean;
begin
  // Step 1. - Get To The First
  WorkData := TrimToToken(':', URIData, False);
  if WorkData = '' then
  begin
    Result := '';
    Exit;
  end;
  // Get The String Length
  StringLength := Length(WorkData);
  // See if any invalid characters are in the system
  InValidScheme := False;
  for i := 1 to StringLength do
  begin
    // Check if the char is valid
    InValidScheme := (WorkData[i] in AllowedChars) = False;
    if InValidScheme then
      Break;
  end;
  if InValidScheme then
  begin
    // we need to return the data back to the string
    URIData := WorkData + ':' + URIData;
  end
  else
  begin
    Result := WorkData;
  end;
end;

constructor TInternetURI.Create(URIData: string);
begin
  // Step 1. - Copy The Fragment
  Fragment := TrimPastToken('#', URIData, False);
  // Step 2. - Crack the Scheme
  Scheme := CrackScheme(URIData);
  // Step 3. - Crack the Network Location
  NetLocation := CrackLocation(URIData);
  // Step 4. - Crack the Query
  Query := CrackQuery(URIData);
  // Step 5. - Crack the Parameters
  Params := CrackParams(URIData);
  // Finally !! Copy the Path (which should be all that is remaining)
  Path := URIData;
end;

destructor TInternetURI.Destroy;
begin
  inherited;

end;

// ---------------------------ooo------------------------------ \\
// Global routines for HTTP Processing
// ---------------------------ooo------------------------------ \\
// ---------------------------ooo------------------------------ \\
// This function will take the DataToParse and create a string
// list seperating the data using the user-defined tokens.
// ---------------------------ooo------------------------------ \\

function TokenizeString(Tokens: TSysCharSet; DataToParse: string): TStringList;
var
  StringLength: Integer;
  i, CurPos, StartPos: Integer;
  tempString: string;
begin
  try
    // Create the result set
    Result := TStringList.Create;
    // Get The String Length
    StringLength := Length(DataToParse);
    // Setup the search
    CurPos := 1;
    StartPos := 1;
    // Look for the tokens
    for i := 1 to StringLength do
    begin
      // Increment the current position
      Inc(CurPos);
      // See if the char is in the token list
      if DataToParse[i] in Tokens then
      begin
        // copy the string to current
        tempString := Copy(DataToParse, StartPos, (CurPos - 1) - StartPos);
        Result.Add(tempstring);
        StartPos := i + 1;
      end;
    end;
    // Copy the final string (if neccesary)
    if (StartPos - 1) <> StringLength then
    begin
      tempString := Copy(DataToParse, StartPos, StringLength);
      Result.Add(tempString);
    end;
  except
    Result.Free;
    Result := nil;
  end;
end;

// ---------------------------ooo------------------------------ \\
// This function will use the TokenizeString routine to get
// all the occurences of Token then return all the string to
// the right of MaxCount occurences.
// ---------------------------ooo------------------------------ \\

function TrimToToken(Token: Char; var DataToParse: string; CopyToken: Boolean = True;
  MaxCount: Integer = 1): string;
var
  i: Integer;
begin
  // First Tokenize the string
  with TokenizeString([Token], DataToParse) do
  begin
    // Check if there were any occurences of Token
    if Count = 0 then
    begin
      // Return blank then free and exit
      Result := '';
      Free;
      Exit;
    end;
    // reset the final string
    DataToParse := '';
    for i := 0 to (MaxCount - 1) do
    begin
      // concat the string
      if CopyToken then
        Result := Result + Strings[i] + Token
      else
        Result := Result + Strings[i];
    end;
    // Copy and remaining data
    for i := (MaxCount) to Pred(Count) do
    begin
      DataToParse := DataToParse + Strings[i];
    end;
    Free;
  end;
end;

// ---------------------------ooo------------------------------ \\
// This function will use the TokenizeString routine to get
// all the occurences of Token then return all the string to
// the left of MaxCount occurences.
// ---------------------------ooo------------------------------ \\

function TrimPastToken(Token: Char; var DataToParse: string; CopyToken: Boolean =
  True; MaxCount: Integer = 1): string;
var
  i: Integer;
begin
  // First Tokenize the string
  with TokenizeString([Token], DataToParse) do
  begin
    // Check if there were any occurences of Token
    if Count = 0 then
    begin
      // Return blank then free and exit
      Result := '';
      Free;
      Exit;
    end;
    // reset the final string
    DataToParse := '';
    for i := 0 to (MaxCount - 1) do
    begin
      // concat the string
      DataToParse := DataToParse + Strings[i];
    end;
    // Copy and remaining data
    for i := (MaxCount) to Pred(Count) do
    begin
      if CopyToken then
        Result := Result + Token + Strings[i]
      else
        Result := Result + Strings[i];
    end;
    Free;
  end;
end;

end.

Nincsenek megjegyzések:

Megjegyzés küldése