2005. január 8., szombat

Downloading a URL’s HTML

Problem/Question/Abstract:

Downloading a URL’s HTML

Answer:

The objects I present in this article allow you to download data from any URL using the GET method, using only the standard socket components included with Delphi 4+. The object (TabHTTPRequest) is capable of connecting directly to a web server and then requesting a file, the object can also pass a query string; as of this writing it can only get a file using the GET method and using a query string. If there is sufficient interest I can expand the object to also handle POST and cookies, as well as interpreting the result so the return header can be used, so let me know what you guys think!

TInternetURI – This object takes a URI (uniform resource indicator) and splits it into it’s various components to allow the GET object to accept a URL such as http://www.borland.com/delphi/ as a parameter. You do not need to use this object directly.  This object however, follows the complete RFC standard for HTTP addresses and can be used to interpret any URL into its various components.

TabHTTPRequest – This object is designed to connect to a web server and download the HTML, which can then be used in your application.

A couple examples:

URL:

http://www.borland.com/delphi/

CODE:

with TabHTTPRequest.Create do
begin
Get('http://www.borland.com/delphi/');
// Work with result (ex. mmURL.Text := ResultData.DataString);
Free;
end; // with

URL:

http://www.borland.com/rad/delandcppletter.html

CODE:

with TabHTTPRequest.Create do
begin
Get('http://www.borland.com/rad/delandcppletter.html’);
// Work with result (ex. mmURL.Text := ResultData.DataString);
Free;
end; // with

URL: (This is an actual search on yahoo)

http://google.yahoo.com/bin/query?p=delphi+3000&hc=0&hs=0

CODE:

with TabHTTPRequest.Create do
begin
Get('http://google.yahoo.com/bin/query?p=delphi+3000&hc=0&hs=0');
// Work with result (ex. mmURL.Text := ResultData.DataString);
Free;
end; // with

Once get has been called you can access the HTML through the ResultData property:

mmHTML.Lines.Text := URLObject.ResultData.DataString;

I hope you found this article and function to be useful; I’d love to hear your comments, suggestions, etc.

The following is the source code for the functions described above, feel free to use the code in your own programs, but please leave my name and address intact!

I also have a complete test program available by request via e-mail.

// ---------------------------ooo------------------------------ \\
// ©2000 David Lederman
// dlederman@ssccompany.com
// ---------------------------ooo------------------------------ \\
unit abHTTPGet;

interface

uses
Classes, Sysutils, ScktComp;

// ---------------------------ooo------------------------------ \\
// This type will crack a Uniform Resource Indicator
// ---------------------------ooo------------------------------ \\
type
TInternetURI = class(TObject)
private
function CrackScheme(var URIData: string): string;
function CrackLocation(var URIData: string): string;
function CrackQuery(var URIData: string): string;
function CrackParams(var URIData: string): string;
public
Scheme: string;
NetLocation: string;
Path: string;
Query: string;
Fragment: string;
Params: string;
constructor Create(URIData: string);
destructor Destroy; override;
end;

type
TabHTTPRequest = class
private
iBuffer: string;
Socket: TClientSocket;
public
ResultData: TStringStream;
HostToConnect: string;
PortToConnect: Integer;
FileToGet: string;
TimeOut: Integer;
function Get: Boolean; overload;
function Get(URL: string): Boolean; overload;
constructor Create;
destructor Destroy; override;
end;

// ---------------------------ooo------------------------------ \\
// Global HTTP Routines
// ---------------------------ooo------------------------------ \\
function TrimToToken(Token: Char; var DataToParse: string; CopyToken: Boolean = True;
MaxCount: Integer = 1): string;
function TrimPastToken(Token: Char; var DataToParse: string; CopyToken: Boolean =
True; MaxCount: Integer = 1): string;

implementation

{ TabHTTPRequest }

constructor TabHTTPRequest.Create;
begin
// Simply Set Defaults
HostToConnect := 'www.InternetToolsCorp.com';
PortToConnect := 80;
FileToGet := '/';
TimeOut := 5000;
// Create the socket object
Socket := TClientSocket.Create(nil);
Socket.ClientType := ctBlocking;
// Create the result stream
ResultData := TStringStream.Create('');
end;

destructor TabHTTPRequest.Destroy;
begin
// Free the helper objects
Socket.Free;
ResultData.Free;
inherited;
end;

function TabHTTPRequest.Get: Boolean;
var
Waiter: TWinSocketStream;
BufferData: array[0..4028] of char;
DataRead: Integer;
BufferString: string;
begin
// Setup the Request
Waiter := nil;
iBuffer := '';
Socket.Host := HostToConnect;
Socket.Port := PortToConnect;
// Reset the data stream
ResultData.Size := 0;
try
// Do the request
// Open the connection
//  Socket.Open;
Socket.Open;
// Create the waiter
Waiter := TWinSocketStream.Create(Socket.Socket, TimeOut);
// Prepare the request
BufferString := 'GET ' + FileToGet + ' HTTP/1.1' + #13#10 + 'Host: ' +
HostToConnect + #13#10 + #13#10;
// Write the Request
Waiter.Write(BufferString[1], Length(BufferString));
Waiter.Free;
Waiter := nil;
// Now process the result of the request
while Socket.Socket.Connected do
begin
try
// Create the waiter
Waiter := TWinSocketStream.Create(Socket.Socket, TimeOut);
// Wait for data
if Waiter.WaitForData(TimeOut) then
begin
// Try to read a chunck of data
DataRead := Waiter.Read(BufferData, SizeOf(BufferData));
// Check if we got data
if DataRead = 0 then
begin
// Get out
Socket.Close;
end
else
begin
// Save the data to the stream
ResultData.Write(BufferData, DataRead);
end;
end
else
begin
Socket.Close;
end;
finally
Waiter.Free;
Waiter := nil;
end;
end;
// close the socket
if Socket.Active then
Socket.Close;
Result := True;
// Clean up
if Waiter <> nil then
Waiter.Free;
except
// Free the waiter object
if Waiter <> nil then
Waiter.Free;
// Close the socket if it's open
if Socket.Active then
Socket.Close;
// reraise the exception
raise;
end;
end;

function TabHTTPRequest.Get(URL: string): Boolean;
begin
// Crack the URL
try
// Make sure than a scheme is in place
if Pos('://', URL) = 0 then
begin
// Simply Prepend the HTTP
URL := 'http://' + URL;
end;
// Make sure that a / is in the URL
if Pos('/', Copy(URL, 8, Length(URL))) = 0 then
begin
// Simply Append the trailing /
URL := URL + '/';
end;

with TInternetURI.Create(URL) do
begin
// Check if there is a port in the net location
if Pos(':', NetLocation) <> 0 then
begin
// Copy the host name
HostToConnect := Copy(NetLocation, 1, Pos(':', NetLocation) - 1);
// Copy the port
PortToConnect := StrToInt(Copy(NetLocation, Pos(':', NetLocation) + 1,
Length(NetLocation)));
end
else
begin
HostToConnect := NetLocation;
PortToConnect := 80;
end;
FileToGet := '';
// Set the File to get
if Query <> '' then
FileToGet := Path + '?' + Query;
if FileToGet = '' then
FileToGet := '/';
Free
end; // with
// Now simply call get
Result := Get;
except
raise;
end;
end;

{ TInternetURI }

function TInternetURI.CrackLocation(var URIData: string): string;
var
StartPos, EndPos: Integer;
begin
// Step 1. - See if the network ID is here
StartPos := Pos('//', URIData);
// If the starting // is not found then there is no network location
if StartPos = 0 then
Exit;
// Delete the first //
Delete(URIData, StartPos, 2);
// Now look for the trailing slash
EndPos := Pos('/', URIData);
if (EndPos = 0) or (EndPos = 1) then
Exit;
// Now Copy the String Upto the /
Result := Copy(URIData, 1, EndPos - 1);
// Now Delete the network location
Delete(URIData, 1, EndPos - 1);
end;

function TInternetURI.CrackParams(var URIData: string): string;
var
StartPos: Integer;
begin
// Step 1. - See if the query is here
StartPos := Pos(';', URIData);
// If the starting ; is not found then there are no params
if StartPos = 0 then
Exit;
// Copy the Params String
Result := Copy(URIData, StartPos + 1, Length(URIData));
Delete(URIData, StartPos, Length(URIData));
end;

function TInternetURI.CrackQuery(var URIData: string): string;
var
StartPos: Integer;
begin
// Step 1. - See if the query is here
StartPos := Pos('?', URIData);
// If the starting ? is not found then there is no query
if StartPos = 0 then
Exit;
// Copy the Query String
Result := Copy(URIData, StartPos + 1, Length(URIData));
Delete(URIData, StartPos, Length(URIData));
end;

function TInternetURI.CrackScheme(var URIData: string): string;
const
AllowedChars = ['A'..'Z', 'a'..'z', '0'..'9', '+', '-', '.'];
var
tString, WorkData: string;
i: Integer;
StringLength: Integer;
InValidScheme: Boolean;
begin
// Step 1. - Get To The First
WorkData := TrimToToken(':', URIData, False);
if WorkData = '' then
begin
Result := '';
Exit;
end;
// Get The String Length
StringLength := Length(WorkData);
// See if any invalid characters are in the system
InValidScheme := False;
for i := 1 to StringLength do
begin
// Check if the char is valid
InValidScheme := (WorkData[i] in AllowedChars) = False;
if InValidScheme then
Break;
end;
if InValidScheme then
begin
// we need to return the data back to the string
URIData := WorkData + ':' + URIData;
end
else
begin
Result := WorkData;
end;
end;

constructor TInternetURI.Create(URIData: string);
begin
// Step 1. - Copy The Fragment
Fragment := TrimPastToken('#', URIData, False);
// Step 2. - Crack the Scheme
Scheme := CrackScheme(URIData);
// Step 3. - Crack the Network Location
NetLocation := CrackLocation(URIData);
// Step 4. - Crack the Query
Query := CrackQuery(URIData);
// Step 5. - Crack the Parameters
Params := CrackParams(URIData);
// Finally !! Copy the Path (which should be all that is remaining)
Path := URIData;
end;

destructor TInternetURI.Destroy;
begin
inherited;

end;

// ---------------------------ooo------------------------------ \\
// Global routines for HTTP Processing
// ---------------------------ooo------------------------------ \\
// ---------------------------ooo------------------------------ \\
// This function will take the DataToParse and create a string
// list seperating the data using the user-defined tokens.
// ---------------------------ooo------------------------------ \\

function TokenizeString(Tokens: TSysCharSet; DataToParse: string): TStringList;
var
StringLength: Integer;
i, CurPos, StartPos: Integer;
tempString: string;
begin
try
// Create the result set
Result := TStringList.Create;
// Get The String Length
StringLength := Length(DataToParse);
// Setup the search
CurPos := 1;
StartPos := 1;
// Look for the tokens
for i := 1 to StringLength do
begin
// Increment the current position
Inc(CurPos);
// See if the char is in the token list
if DataToParse[i] in Tokens then
begin
// copy the string to current
tempString := Copy(DataToParse, StartPos, (CurPos - 1) - StartPos);
Result.Add(tempstring);
StartPos := i + 1;
end;
end;
// Copy the final string (if neccesary)
if (StartPos - 1) <> StringLength then
begin
tempString := Copy(DataToParse, StartPos, StringLength);
Result.Add(tempString);
end;
except
Result.Free;
Result := nil;
end;
end;

// ---------------------------ooo------------------------------ \\
// This function will use the TokenizeString routine to get
// all the occurences of Token then return all the string to
// the right of MaxCount occurences.
// ---------------------------ooo------------------------------ \\

function TrimToToken(Token: Char; var DataToParse: string; CopyToken: Boolean = True;
MaxCount: Integer = 1): string;
var
i: Integer;
begin
// First Tokenize the string
with TokenizeString([Token], DataToParse) do
begin
// Check if there were any occurences of Token
if Count = 0 then
begin
// Return blank then free and exit
Result := '';
Free;
Exit;
end;
// reset the final string
DataToParse := '';
for i := 0 to (MaxCount - 1) do
begin
// concat the string
if CopyToken then
Result := Result + Strings[i] + Token
else
Result := Result + Strings[i];
end;
// Copy and remaining data
for i := (MaxCount) to Pred(Count) do
begin
DataToParse := DataToParse + Strings[i];
end;
Free;
end;
end;

// ---------------------------ooo------------------------------ \\
// This function will use the TokenizeString routine to get
// all the occurences of Token then return all the string to
// the left of MaxCount occurences.
// ---------------------------ooo------------------------------ \\

function TrimPastToken(Token: Char; var DataToParse: string; CopyToken: Boolean =
True; MaxCount: Integer = 1): string;
var
i: Integer;
begin
// First Tokenize the string
with TokenizeString([Token], DataToParse) do
begin
// Check if there were any occurences of Token
if Count = 0 then
begin
// Return blank then free and exit
Result := '';
Free;
Exit;
end;
// reset the final string
DataToParse := '';
for i := 0 to (MaxCount - 1) do
begin
// concat the string
DataToParse := DataToParse + Strings[i];
end;
// Copy and remaining data
for i := (MaxCount) to Pred(Count) do
begin
if CopyToken then
Result := Result + Token + Strings[i]
else
Result := Result + Strings[i];
end;
Free;
end;
end;

end.


Nincsenek megjegyzések:

Megjegyzés küldése