/* A demonstration of how to use Kirstein's zero crossing interval
   distributions for speaker-dependent speech recognition of isolated
   words, using a PC and a Sound-Blaster compatible sound card.

   Though I compiled it using Turbo-C 2.0, it should take little, if any,
   adaptation to have a different compiler process it.

   Please read sbrecog.doc for further information.

   Johannes Kiehl, Trier (Germany), (c) 1993
*/

#define CPUSPEED 12
	/* insert here the CPU clock rate in MHz; the delay constants
	   PDELAY and RDELAY are computed from this value */

#include <stdio.h>
#include <stdlib.h>
#include <alloc.h>
#include <math.h>
#include <io.h>
#include <fcntl.h>
#include <string.h>

#include <direct.h> /* SB direct access */

#define TRUE 1
#define FALSE 0
/* Prints the error message x (a plain string) on a line of its own.
   The do/while(0) wrapper makes "ERROR(...);" one single statement, so it
   is safe in an unbraced if/else. x is printed through "%s", so a '%'
   inside the message is never taken as a conversion specification.
*/
#define ERROR(x) do { printf("\nAn error occured: %s\n", (x)); } while (0)
#define MAXDICTIONARY 32
	/* the maximum number of dictionary entries.
	   Must keep it <256, or some variables will overflow
	*/
/* Arguments passed to asmdelay() after every sample that is written
   (PDELAY, see play_sample) or read (RDELAY, see harken). Both scale with
   CPUSPEED; the parentheses keep the product intact when the macro is
   used inside a larger expression.
*/
#define PDELAY (CPUSPEED*6)
#define RDELAY (CPUSPEED*2.3)

/* truth value; holds TRUE or FALSE */
typedef int boole;
/* unsigned 8 bit value; used for sound samples and small counters */
typedef unsigned char byte;
/* parameter vector of one word: 16 floats, one per interval class
   (see addtovector) */
typedef float soundvect[16];

/* histogram of zero crossing interval lengths; incremented by classify,
   cleared by tidyup */
unsigned zerotable[64];
/* dictionary: the vectors stored by training, one per trained word.
   parvect: the vector of the most recent recording, built by
   analyze_table */
soundvect dictionary[MAXDICTIONARY],parvect;
/* ID strings of the dictionary entries; allocated in training */
char *identifiers[MAXDICTIONARY];
/* number of dictionary entries currently in use */
int dictsize=0;
/* number of samples counted by analyze since the last zero crossing */
unsigned zerolength=0;

void play_sample(byte *snd,long size)
/* Plays back the `size` samples starting at snd: the speaker is switched
   on, every sample is handed to write_data() followed by
   asmdelay(PDELAY), and finally the speaker is switched off again.
   Nothing is written if size is not positive.
*/
{	byte *cursor=snd;
	long remaining;

	speaker_on();
	for (remaining=size;remaining>0;remaining--) {
		write_data(*cursor);
		asmdelay(PDELAY);
		cursor++;
	}
	speaker_off();
}

void clip(byte *signal,unsigned size)
/* Reduces the first `size` samples of signal, in place, to two levels:
   every sample greater than 128 becomes 255, every other sample
   becomes 0.

   The samples are assigned directly; assigning to a cast expression
   such as "(byte)signal[i]=255" is not valid ISO C, because a cast does
   not yield an lvalue.
*/
{	unsigned i;

	for (i=0;i<size;i++)
		signal[i]=(signal[i]>128)?255:0;
}

void classify(unsigned length)
/* Counts one zero crossing interval of `length` samples in zerotable:
   lengths 1..64 are counted in zerotable[length-1], every longer
   interval is counted in the last entry, zerotable[63].
   A length of 0 is not an interval and is ignored; without this check
   length-1 would wrap around and index far outside the table.

   At a sampling rate of 11kHz, one byte represents 90 microseconds.
   Thus 64 bytes mean an interval of 5.8 ms, or a 86 Hz frequency.
   Fairly sufficient, according to Kristein, who set the lower
   margin of his own implementation at 79 Hz (6.3 ms intervals).
   This means that, for the given sampling rate, 64 classes instead
   of Kristein's 200 are enough
*/
{	if (length==0) return;
	zerotable[(length>64)?63:length-1]++;
}

void analyze(byte *signal,unsigned size)
/* Clips the first `size` samples of signal (see clip; signal is modified)
   and walks through them once. zerolength is incremented for every
   sample from the second one on; whenever a sample differs from its
   predecessor, the count reached so far is passed to classify and
   zerolength starts again at 0.
   zerolength is a global and is not reset here, so whatever it holds on
   entry is part of the first interval.
*/
{	unsigned pos;

	clip(signal,size);
	pos=1;
	while (pos<size) {
		zerolength++;
		if (signal[pos]!=signal[pos-1]) {
			classify(zerolength);
			zerolength=0;
		}
		pos++;
	}
}

/* Upper bounds of the 16 interval classes used by addtovector: class j
   (1..16) covers the table entries limits[j-1]+1 .. limits[j];
   limits[0]=0 only serves as the lower bound of class 1.
*/
byte limits[17]={0,1,2,3,4,5,6,7,8,10,12,15,19,25,34,48,64};

void addtovector(byte i,float a)
/*
 Adds the value a of zero crossing table entry i (1..64) to the class of
 parvect that entry i belongs to. a is divided by the number of table
 entries the class covers, so a class sums up to the average of its
 entries.

 table  1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | 13 | 16 | 20 | 26 | 35 | 49
          |   |   |   |   |   |   |   |-10|-12 |-15 |-19 |-25 |-34 |-48 |-64
 class  1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16

 An i outside 1..64 belongs to no class and is ignored; without this
 check 0 would access limits[-1] and parvect[-1], and values above 64
 would search past the end of limits.
*/
{	byte j,size;

	if (i<1 || i>limits[16]) return;
	/* find the class: the first j whose upper bound is not below i */
	j=1; while (i>limits[j]) j++;
	size=limits[j]-limits[j-1];
	parvect[j-1]+=(float)a/(float)size;
}

void analyze_table(void)
/* Turns the histogram in zerotable into the parameter vector parvect:
   every table entry is divided by the largest entry of the table and
   passed to addtovector together with its 1-based position.
   max starts at 1, so a table of all zeroes does not cause a division
   by zero.
*/
{	unsigned max=1;
	byte i;

	for(i=0;i<64;i++) if (zerotable[i]>max) max=zerotable[i];
	for(i=0;i<64;i++) addtovector(i+1,(float)zerotable[i]/(float)max);
}

void tidyup(void)
/* set the work space, i.e. parvect and zerotable, to all zeroes.
   Must be done *before* any and every call to analyze
*/
{	byte i;

	for(i=0;i<64;i++) {
		parvect[i/4]=0.0;
		zerotable[i]=0;
	}
}

boole harken(boole quiet)
/* Records one word from the sound card and converts it into parvect.

   1. The work space is cleared (tidyup) and the DSP is reset.
   2. Samples are read and discarded until 16 consecutive samples differ
      from 128 by more than `threshold`.
   3. Samples are then stored until 512 consecutive samples differ from
      128 by less than `threshold`, or until the buffer (32768 bytes) is
      full.
   4. The sample count is reduced by 1024. If more than 512 samples
      remain, the recording is played back (unless quiet is TRUE) and
      analyzed from its 17th sample on; parvect then holds the result.

   Returns TRUE if a word was analyzed, FALSE if the recording was too
   short. Terminates the program with exit status 1 if the DSP cannot be
   reset or the sample buffer cannot be allocated.
*/
{	byte threshold=2;
	byte *snd;
	int ctr,w;
	long i,maxsize=32768;
	boole result=FALSE;

	tidyup();
	if(reset_dsp()!=SBOK) {
		printf("\nError resetting Sound Blaster.\n"); exit(1);
	}
	snd=(byte*)malloc(maxsize);
	if (snd==(byte*)NULL) {
		/* this is a failure, so do not report success to the caller
		   of the program */
		ERROR("Memory allocation (fct harken)"); exit(1);
	}
	/* wait for the signal to leave the quiet band around 128 */
	ctr=0;
	do {
		w=read_data(); asmdelay(RDELAY);
		if (abs(w-128)>threshold) ctr++;
		else ctr=0;
	} while (ctr<16);
	printf("Recording...");
	ctr=0; i=0;
	do {
		w=read_data(); asmdelay(RDELAY);
		if (abs(w-128)<threshold) ctr++;
		else ctr=0;
		snd[i++]=(byte)w;
	} while ((ctr<512) && (i<maxsize));
	/* i is a long, so it has to be printed with %ld */
	printf(" Done (%ld)\n",i);
	/* NOTE(review): presumably this cuts off the quiet tail that ended
	   the recording -- confirm the intended amount */
	i-=1024;
	if (i>512) {
		result=TRUE;
		if (!quiet) play_sample(snd,i);
		/* snd+16 .. snd+16+i-1 stays inside the buffer because i is
		   at most maxsize-1024 here */
		analyze(snd+16,i);
		analyze_table();
	}
	free(snd);
	return(result);
}

void training(void)
{	char wk[64],wk2[8];
	boole done,inok;
	byte j,k=0;

	printf("You can now train up to %d words. You will be prompted to\n",
		MAXDICTIONARY);
	printf("  enter an ID string for each word, then pronounce it.\n");
	do {
		printf("\nPlease enter ID string #%d, or Q to quit training: ",
			dictsize);
		scanf("%s",wk);
		done=!strcmp(wk,"Q");
		if (!done) {
			if (identifiers[dictsize]==NULL)
				identifiers[dictsize]=malloc(strlen(wk)+1);
			strcpy(identifiers[dictsize],wk);
			do {
				inok=harken(FALSE);
				if (!inok) {
					printf("No word identified. ");
					printf("Try again (Y/N)? Y ");
					scanf("%s",wk2);
					done=!strcmp(wk2,"N");
					if (done) inok=TRUE;
				}
			} while (!inok);
			if (!done) {
				printf("Insert %s into dictionary (Y/N)? Y ",
					wk);
				scanf("%s",wk2);
				if (!!strcmp(wk2,"N")) {
					for(j=0;j<16;j++)
						dictionary[dictsize][j]=
							parvect[j];
					dictsize++;
				}
			}
		}
	} while (!done);
	printf("\nWould you like another training set of the same words to\n");
	printf("  be averaged with the set you just entered (Y/N)? N ");
	scanf("%s",wk);
	if (!!strcmp(wk,"N")) do {
		printf("\nPlease speak again #%d: %s ",k,identifiers[k]);
		do {
			inok=harken(FALSE);
			if (!inok) {
				printf("No word identified. ");
				printf("Please try again ");
			}
		} while (!inok);
		printf("Average %s into dictionary (Y/N)? Y ",wk);
		scanf("%s",wk2);
		if (!!strcmp(wk2,"N")) {
			for(j=0;j<16;j++) {
				dictionary[k][j]+=parvect[j];
				dictionary[k][j]/=2;
			}
			k++;
		}
	} while (k<dictsize);
}

float contingency(byte n)
/* Compares the parameter vector (parvect) with the nth vector stored in
   the dictionary and returns the "information transmission rate"
   ("Transinformationswert") of the two, which serves as a measure of
   their similarity; 0.0 means very similar. The author gives the range
   of the result as 0<=value<=1.

   Both vectors form the two rows of a 2x16 table. The table is scaled so
   that all cells add up to 1.0; the result is the sum, over all cells
   with a positive value p and a positive product e of their row sum and
   column sum, of p*log2(p/e).

   Error status: -1.0 -- the cells of both vectors add up to zero,
   cannot divide
*/
{	float cell[2][16];	/* row 0: parvect, row 1: dictionary[n] */
	float rowsum[2],colsum[16];
	float total=0.0,rate=0.0;
	float expected,ratio,p;
	double ln2;
	byte r,c;

	/* fill the table, adding up the rows and the overall sum */
	for (r=0;r<2;r++) {
		rowsum[r]=0.0;
		for (c=0;c<16;c++) {
			if (r==0) cell[r][c]=parvect[c];
			else cell[r][c]=dictionary[n][c];
			rowsum[r]+=cell[r][c];
		}
		total+=rowsum[r];
	}
	if (total==0.0) return(-1.0);

	/* scale cells and row sums to an overall sum of 1.0 */
	for (r=0;r<2;r++) {
		for (c=0;c<16;c++) cell[r][c]/=total;
		rowsum[r]/=total;
	}
	/* column sums of the scaled table */
	for (c=0;c<16;c++) {
		colsum[c]=0.0;
		for (r=0;r<2;r++) colsum[c]+=cell[r][c];
	}
	/* rate of transmission */
	ln2=log(2);
	for (r=0;r<2;r++)
		for (c=0;c<16;c++) {
			expected=rowsum[r]*colsum[c];
			if (expected>0.0) {
				p=cell[r][c];
				ratio=p/expected;
				if (ratio>0.0) rate=rate+p*(log(ratio)/ln2);
			}
		}
	return(rate);
}

/* Result of match(): two lists of dictionary indices. The indices are
   stored increased by 1, and each list ends with a 0 entry.
   first  -- the closest match or equally close matches
   second -- the matches that come close to the best one
*/
typedef struct
{
	byte first[256],second[256];
} matchstrc;

matchstrc *match(void)
/* Compares the actual parvect to each one stored in the dictionary;
   returns a pointer to a newly allocated matchstrc, which the caller
   has to free(). It contains two lists, each shorter than 256
   elements: The first list contains the closest match or equally close
   matches. The second list contains elements only if one or more
   matches were found whose contingency value is "worse" by no more
   than 0.001.
   The end of each list is indicated by a zero value! To make this
   possible all the indices stored in the lists are >0; subtract 1
   before using them.
   Contingency values worse than 0.060 are not accepted as matches!
   This holds for the second list as well: an entry is only put there
   if its own value is still below that limit.

   The contingency values are compared as integers, after scaling by
   1000 and truncation. Entries for which contingency reports its error
   status (-1.0) are never matched.

   Error status: NULL returned -- dictionary empty, no entry below the
   limit, or the result could not be allocated
*/
{	byte i,cf,cs;
	int rs,minval=2000;
	int limit=60;		/* 0.060 scaled by 1000 */
	int rsarr[MAXDICTIONARY];
	matchstrc *result=NULL;

	for (i=0;i<dictsize;i++) {
		rs=(int)(contingency(i)*1000);
		if ((rs>-1000) && (rs<minval)) minval=rs;
		rsarr[i]=rs;
	}
	if (minval<limit) {
		result=(matchstrc*)malloc(sizeof(matchstrc));
		if (result!=NULL) {
			cf=cs=0;
			for (i=0;i<dictsize;i++) {
				if (rsarr[i]==minval)
					result->first[cf++]=i+1;
				else if ((rsarr[i]==minval+1) &&
					 (rsarr[i]<limit))
					result->second[cs++]=i+1;
			}
			result->first[cf]=0;
			result->second[cs]=0;
		}
	}
	return(result);
}

void ppmatches(matchstrc m)
/* Prints the identifiers of the two lists in m on one line.
   The entries of the first list are separated by ", ", with " or "
   before the last one, and followed by " match best. " (or
   " matched best. " if there is only one). The entries of the second
   list are separated by ", ", with " and " before the last one, and
   followed by " come close." (or " comes close" if there is only one).
   An empty list prints nothing. The line always ends with a newline.
*/
{	byte nbest=0,nclose=0,k;

	/* the lists are zero terminated: count their entries */
	while (m.first[nbest]!=0) nbest++;
	while (m.second[nclose]!=0) nclose++;

	if (nbest==1) printf("%s matched best. ",identifiers[m.first[0]-1]);
	else for (k=0;k<nbest;k++) {
		printf("%s",identifiers[m.first[k]-1]);
		if (k==nbest-1) printf(" match best. ");
		else if (k==nbest-2) printf(" or ");
		else printf(", ");
	}

	if (nclose==1) printf("%s comes close",identifiers[m.second[0]-1]);
	else for (k=0;k<nclose;k++) {
		printf("%s",identifiers[m.second[k]-1]);
		if (k==nclose-1) printf(" come close.");
		else if (k==nclose-2) printf(" and ");
		else printf(", ");
	}
	printf("\n");
}

void recogniser(void)
/* Recognition loop: records a word with harken (without playback),
   matches it against the dictionary and prints the result of match via
   ppmatches, or a message if no word was recorded or nothing matched.
   The loop has no exit condition, so this function does not return.
*/
{	matchstrc *found;

	printf("\nNow you can speak words, the program will match them with\n");
	printf("  the trained dictionary and produce the identifiers of\n");
	printf("  the closest matches.\n");
	for (;;) {
		printf("\n");
		if (!harken(TRUE)) {
			printf("No word identified during recording\n");
			continue;
		}
		found=match();
		if (found==NULL) {
			printf("No matches.\n");
			continue;
		}
		ppmatches(*found);
		free(found);
	}
}

void initidents(void)
/* Marks all MAXDICTIONARY slots of identifiers as unused by storing
   NULL in each of them.
*/
{	char **slot;

	for (slot=identifiers;slot<identifiers+MAXDICTIONARY;slot++)
		*slot=NULL;
}

int main(void)
/* Program entry: prints the banner, clears the identifier table, runs
   the training dialogue and then the recogniser.
   The return type is spelled out because implicit int is no longer part
   of C since C99; 0 is returned should recogniser ever return.
*/
{
	printf("SBRECOG speech recogniser demo\n");
	printf("(c) Johannes Kiehl, Trier 1993\n\n");
	initidents();
	training();
	recogniser();
	return 0;
}
