unit FParse;
{ Copyright 1996 Kevin L. Boylan }

{
	TFParse will parse words (as defined by the developer or user) from a file, s string, or
	a PChar.  For performance purposes, all parsing is done in memory.  If you are parsing from
	a file and the file is > 64K, it will be buffered in 64K chunks.

	What is to be parsed is determined by setting one of the following three properties.
	Setting one	of these properties will override any previous settings of one of the three:

	FileToParse:			   The path and filename of a file that you wish to parsed.
	StringToParse:				A String variable that you wish to parse.
	PCharToParse:				A PChar variable that you wish to parse.

	What constitutes a word is defined through these properties:

	NormalCharacters: 		The set of characters that can go into a word.
	SignificantCharacters:	The set of characters that can go into a word only if surrounded
									by NormalCharacters.
	InsignificantCharacters:The set of characters that will be ignored and removed from between
									two NormalCharacters.
	MIN_WORD_LENGTH:			The minimum length of words that will be accepted.
	MAX_WORD_LENGTH:			The maximum length of words returned.  Longer words are truncated.
	CommonWords:				This list of words will be ingored, not returned. If you don't want
									to make use of this property, you can set the property CmnWrdsActive
									to False.

	Once these properties are set, you then repeatedly call GetNext, which returns a string
	containing the next word from the file, string, or PChar.  When GetNext returns a null
	string ('') then all words have been parsed.

	5/12/96			Released version 1.0
	5/15/96			Modified CreateCharSets, changing for loops to while loops because
						the counter was being modified inside the loops with inc(i,2).  This
						problem kept the component from installing under Delphi 2.0  This
						modification resulted in version 1.1.
}

interface

uses
	SysUtils, WinTypes, WinProcs, Messages, Classes, Graphics, Controls,
	Forms, Dialogs;

type
	Str20 = String[20];

	TFParse = class(TComponent)
		private
	  { Private declarations }
	  FFileToParse: String;
	  theFile: TFileStream;
	  ByteCount: LongInt;
	  TotalSize: LongInt;
	  FBuffer,CurrChar,EOBuffer,PlaceHolder: PChar;
	  BuffLen: Word;
	  NormChar,SigChar,InSigChar: Set of Char;
	  EOFile: Boolean;
	  FNormalCharacters: String;
	  FSignificantCharacters: String;
	  FInsignificantChars: String;
	  FCommonWords: TStrings;
	  FCmnWrdsActive: Boolean;
	  FMIN_WORD_LENGTH: Integer;
	  FMAX_WORD_LENGTH: Integer;
	protected
	  { Protected declarations }
	  procedure SetFileToParse( FName: String );
	  procedure SetStringToParse( theStr: String );
	  procedure SetPCharToParse( thePChar: PChar );
	  function  GetPercentDone: LongInt;
	  function  NextChar: Boolean;
	  procedure Loaded; override;
	  procedure CreateCharSets;
	  procedure SetCommonWords(Value: TStrings);
	  procedure Init;
	public
	  { Public declarations }
	  constructor Create( AOwner: TComponent ); override;
	  destructor Destroy; override;
	  property StringToParse: String write SetStringToParse;
	  property PCharToParse: PChar write SetPCharToParse;
	  function GetNext: String;
	  property PercentDone: LongInt read GetPercentDone;
	published
	  { Published declarations }
	  property FileToParse: String read FFileToParse write SetFileToParse;
	  property MIN_WORD_LENGTH: Integer read FMIN_WORD_LENGTH write FMIN_WORD_LENGTH default 1;
	  property MAX_WORD_LENGTH: Integer read FMAX_WORD_LENGTH write FMAX_WORD_LENGTH default 20;
	  property NormalCharacters: String read FNormalCharacters write FNormalCharacters;
	  property SignificantCharacters: String read FSignificantCharacters
	  												 						write FSignificantCharacters;
	  property InsignificantChars: String read FInsignificantChars write FInsignificantChars;
	  property CommonWords: TStrings read FCommonWords write SetCommonWords;
	  property CmnWrdsActive: Boolean read FCmnWrdsActive write FCmnWrdsActive;
	end;


procedure Register;
function MinLongInt( Long1, Long2: LongInt ): LongInt;

implementation

const
	MAX_WORD = 65526;


constructor TFParse.Create( AOwner: TComponent );
begin
	Inherited Create( AOwner );
	MIN_WORD_LENGTH := 1;		{ Default minimum word length }
	MAX_WORD_LENGTH := 20;      { Default maximum word length }
	FCmnWrdsActive := False;     { By default, CommonWords is not active }
	If (csDesigning In ComponentState) then
	 begin
		NormalCharacters := '0-9A-Za-z'; { Default chars that make words }
		SignificantCharacters := './';   { Default chars that can be inside words only }
		InsignificantChars := ',';       { Default chars that will be stripped from words }
	 end;
	 FCommonWords := TStringList.Create;
end;

destructor TFParse.Destroy;
begin
	theFile.Free;
	StrDispose( FBuffer );
	FCommonWords.Free;
	Inherited Destroy;
end;

procedure TFParse.Loaded;
begin
	Inherited Loaded;
	if not (csDesigning In ComponentState) then
		CreateCharSets;  { Create the Sets of Chars }
end;

procedure TFParse.SetCommonWords(Value: TStrings);
begin
	FCommonWords.Assign(Value);
end;


procedure TFParse.CreateCharSets;
{ Converts from the string representation of the character sets to real
	sets of char }
var
	i: Integer;
	j: Char;
begin
	{Create Normal Character Set }
	If NormalCharacters = '' then
		NormalCharacters := '0-9A-Za-z';  { If none set, then use the default }
	NormChar := [];
	i := 1;
	While i <= Length(NormalCharacters) do  { Read each character }
	 begin
		If (i < Length(NormalCharacters)-1) and (NormalCharacters[i+1] = '-') then
		 begin      { Handle ranges }
			For j := NormalCharacters[i] to NormalCharacters[i+2] do
				NormChar := NormChar + [j];    { Add to set }
			Inc(i,2);
		 end
		else
		 begin
			NormChar := NormChar + [NormalCharacters[i]];    { Add to set }
		 end;
		Inc(i,1);
	 end;
	{Create Significant Character Set }
	SigChar := [];
	i := 1;
	While i <= Length(SignificantCharacters) do
	 begin
		If SignificantCharacters[i+1] = '-' then
		 begin
			For j := SignificantCharacters[i] to SignificantCharacters[i+2] do
				SigChar := SigChar + [j];
			Inc(i,2);
		 end
		else
		 SigChar := SigChar + [SignificantCharacters[i]];
		Inc(i,1);
	 end;
	{Create InSignificant Character Set }
	InSigChar := [];
	i := 1;
	While i <= Length(InsignificantChars) do
	 begin
		If (i < Length(InsignificantChars)-1) and (InsignificantChars[i+1] = '-') and
			(InsignificantChars[i+2] <> '-') then
		 begin
			For j := InsignificantChars[i] to InsignificantChars[i+2] do
				InSigChar := InSigChar + [j];
			Inc(i,2);
		 end
		else
		 begin
			InSigChar := InSigChar + [InsignificantChars[i]];
			If (i < Length(InsignificantChars)-1) and (InsignificantChars[i+1] = '-') then
				Inc(i);
		 end;
		Inc(i,1);
	 end;
end;

procedure TFParse.Init;
{ Initializes variables prior to any parsing }
begin
	CurrChar := FBuffer-1; { Will start out incrementing CurrChar }
	ByteCount := 1;
	EOFile := False;
	EOBuffer := FBuffer + BuffLen;
	EOBuffer^ := Chr(0);
end;

procedure TFParse.SetFileToParse( FName: String );
{ Opens the file to be parsed.  Maximum block size to be read in at one time is
	MAX_WORD which should be close to 64K }
begin
	theFile.Free;

	theFile := TFileStream.Create( FName, fmOpenRead );
	If theFile.Size > MAX_WORD then
		BuffLen := MAX_WORD
	else
		BuffLen := theFile.Size+1;
	TotalSize := theFile.Size;
	StrDispose(FBuffer);
	FBuffer := StrAlloc(BuffLen);
	Init;
	theFile.Read( FBuffer^, BuffLen-1 );
end;

procedure TFParse.SetStringToParse( theStr: String );
{ Sets the buffer to be parsed from a String instead of from a file }
begin
	TotalSize := Length(theStr);
	BuffLen := TotalSize + 1;
	StrDispose(FBuffer);
	FBuffer := StrAlloc( BuffLen );
	StrPCopy( FBuffer, theStr );
	Init;
end;

procedure TFParse.SetPCharToParse( thePChar: PChar );
{ Sets the buffer to be parsed from a PChar instead of from a file }
begin
	TotalSize := StrLen( thePChar );
	BuffLen := TotalSize + 1;
	StrDispose(FBuffer);
	FBuffer := StrAlloc( BuffLen );
	StrCopy( FBuffer, thePChar );
	Init;
end;

function TFParse.GetNext: String;
{ returns the next word from the file }
var
	InSig: String;
	i: Integer;
	Loc: Byte;
begin
	Repeat
		Repeat
			InSig := '';
			Result := '';
			PlaceHolder := nil;
			While (NextChar) do
				If (CurrChar^ in NormChar) then
					Break;
			If EOFile then
			 begin
				StrDispose( FBuffer );
				FBuffer := nil;
				exit;
			 end;
			PlaceHolder := CurrChar;
			While (NextChar) do
			 begin
				If (CurrChar^ in InSigChar) then
				 begin
					InSig := InSig + CurrChar^;
					Continue;
				 end;
				If (CurrChar^ in NormChar) or
					((CurrChar^ in SigChar) and ((CurrChar+1)^ in NormChar)) then
					Continue
				else
					Break;
			 end;
			If EOFile then
			 begin
				StrDispose( FBuffer );
           FBuffer := nil;
				exit;
			 end;
			CurrChar^ := Chr(0);
		Until StrLen( PlaceHolder ) >= MIN_WORD_LENGTH;

		Result := Copy(LowerCase(StrPas( PlaceHolder )),1,MAX_WORD_LENGTH);
		If InSig <> '' then
			For i := 1 to Length(InSig) do
			 begin
				Loc := Pos(InSig[i], Result);
				Delete( Result, Loc, 1 );
			 end;
	Until ( (not CmnWrdsActive) or (FCommonWords.IndexOf( Result ) = -1) )
end;

function TFParse.NextChar: Boolean;
{ points at the next character in the buffer and reacts according to what's there }
var
	AmountToRead,
	AmountRead:		LongInt;
begin
	Result := True;
	Inc(CurrChar);
	Inc(ByteCount);
	If CurrChar < EOBuffer then
		exit;
	If ByteCount > TotalSize+1 then
	 begin
		EOFile := True;
		Result := False;
		exit;
	 end;
	If PlaceHolder <> nil then
	 begin
		{ Move word part that we are working on to beginning of buffer }
		StrCopy( FBuffer, PlaceHolder ); { Uses the null at end of FBuffer }
		CurrChar := StrEnd( FBuffer );   { Start reading at end of copied characters }
		PlaceHolder := FBuffer;          { Partial word has been moved to beginning of buffer }
		AmountToRead := BuffLen - 1 - StrLen( FBuffer );
	 end
	else
	 begin
		CurrChar := FBuffer;
		AmountToRead := BuffLen - 1;
		FBuffer[0] := Chr(0);
	 end;
	AmountRead := theFile.Read( (FBuffer+StrLen(FBuffer))^, AmountToRead );
	If AmountRead < AmountToRead then
	 begin
		EOBuffer := CurrChar + AmountRead;
		EOBuffer^ := Chr(0);
	 end;
end;

function TFParse.GetPercentDone: LongInt;
{ returns the percentage of the file that has been parsed }
begin
	Result := MinLongInt(((ByteCount * 100) div TotalSize), 100 );
end;

procedure Register;
{ Registers the component }
begin
	RegisterComponents('Samples', [TFParse]);
end;

function MinLongInt( Long1, Long2: LongInt ): LongInt;
begin
	If Long1 < Long2 then
		result := Long1
	Else
		result := Long2;
end;

end.
