blob: 6322617a16faf0a01e54c29483824bc9d2ce0007 [file] [log] [blame]
/*
* Anagram program by Raymond Chen,
* inspired by a similar program by Brian Scearce
*
* This program is Copyright 1991 by Raymond Chen.
* (rjc@math.princeton.edu)
*
* This program may be freely distributed provided all alterations
* to the original are clearly indicated as such.
*/
/* There are two tricks. First is the Basic Idea:
*
* When the user types in a phrase, the phrase is first preprocessed to
* determine how many of each letter appears. A bit field is then constructed
* dynamically, such that each field is large enough to hold the next power
* of two larger than the number of times the character appears. For example,
* if the phrase is hello, world, the bit field would be
*
* 00 00 00 000 000 00 00
* d e h l o r w
*
* The phrase hello, world, itself would be encoded as
*
* 01 01 01 011 010 01 01
* d e h l o r w
*
* and the word hollow would be encoded as
*
* 00 00 01 010 010 00 01
* d e h l o r w
*
* The top bit of each field is set in a special value called the sign.
* Here, the sign would be
*
* 10 10 10 100 100 10 10
* d e h l o r w
*
* The reason for packing the values into a bit field is that the operation
* of subtracting out the letters of a word from the current phrase can be
* carried out in parallel. for example, subtracting the word hello from
* the phrase hello, world, is merely
*
* d e h l o r w
* 01 01 01 011 010 01 01 (dehllloorw)
* - 00 00 01 010 010 00 01 (hlloow)
* ========================
* 01 01 00 001 000 01 00 (delr)
*
* Since none of the sign bits is set, the word fits, and we can continue.
* Suppose the next word we tried was hood.
*
* d e h l o r w
* 01 01 00 001 000 01 00 (delr)
* - 01 00 01 000 010 00 00 (hood)
* ========================
* 00 00 11 000 110 01 00
* ^ ^
* A sign bit is set. (Two, actually.) This means that hood does not
* fit in delr, so we skip it and try another word. (Observe that
* when a sign bit becomes set, it screws up the values for the letters to
* the left of that bit, but that's not important.)
*
* The inner loop of an anagram program is testing to see if a
* word fits in the collection of untried letters. Traditional methods
* keep an array of 26 integers, which are then compared in turn. This
* means that there are 26 comparisons per word.
*
* This method reduces the number of comparisons to MAX_QUAD, typically 2.
* Instead of looping through an array, we merely perform the indicated
* subtraction and test if any of the sign bits is set.
*/
/* The nuts and bolts:
*
* The dictionary is loaded and preprocessed. The preprocessed dictionary
* is a concatenation of copies of the structure:
*
* struct dictword {
* char bStructureSize; -- size of this structure
* char cLetters; -- number of letters in the word
* char achWord[]; -- the word itself (0-terminated)
* }
*
* Since this is a variable-sized structure, we keep its size in the structure
* itself for rapid stepping through the table.
*
* When a phrase is typed in, it is first preprocessed as described in the
* Basic Idea. We then go through the dictionary, testing each word. If
* the word fits in our phrase, we build the bit field for its frequency
* table and add it to the list of candidates.
*/
/*
* The Second Trick:
*
* Before diving into our anagram search, we then tabulate how many times
* each letter appears in our list of candidates, and sort the table, with
* the rarest letter first.
*
* We then do our anagram search.
*
* Like most anagram programs, this program does a depth-first search.
* Although most anagram programs do some sort of heuristics to decide what
* order to place words in the list_of_candidates, the search itself proceeds
* according to a greedy algorithm. That is, once you find a word that fits,
* subtract it and recurse.
*
* This anagram program exercises some restraint and does not march down
* every branch that shows itself. Instead, it only goes down branches
* that use the rarest unused letter. This helps to find dead ends faster.
*
* FindAnagram(unused_letters, list_of_candidates) {
* l = the rarest letter as yet unused
* For word in list_of_candidates {
* if word does not fit in unused_letters, go on to the next word.
* if word does not contain l, defer.
* FindAnagram(unused_letters - word, list_of_candidates[word,...])
* }
* }
*
*
* The heuristic of the Second Trick can probably be improved. I invite
* anyone willing to improve it to do so.
*/
/* Use the accompanying unproto perl script to remove the ANSI-style
* prototypes, for those of you who have K&R compilers.
*/
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <setjmp.h>
#include <strings.h>
#include <unistd.h>
/* Before compiling, make sure Quad and MASK_BITS are set properly. For best
* results, make Quad the largest integer size supported on your machine.
* So if your machine has long longs, make Quad an unsigned long long.
* (I called it Quad because on most machines, the largest integer size
* supported is a four-byte unsigned long.)
*
* If you need to be able to anagram larger phrases, increase MAX_QUADS.
* If you increase it beyond 4, you'll have to add a few more loop unrolling
* steps to FindAnagram.
*/
typedef unsigned long Quad; /* for building our bit mask */
#define MASK_BITS (sizeof(Quad)*8) /* number of bits in a Quad */
#define MAX_QUADS 2 /* controls largest phrase */
#define MAXWORDS 26000 /* dictionary length */
#define MAXCAND 5000 /* candidates */
#define MAXSOL 51 /* words in the solution */
#define ALPHABET 26 /* letters in the alphabet */
#define ch2i(ch) ((ch)-'a') /* convert letter to index */
#define i2ch(ch) ((ch)+'a') /* convert index to letter */
/* IBM PC's don't like globs of memory larger than 64K without
* special gyrations. That's where the huges get stuck in. And the
* two types of allocations on an IBM PC need to be handled differently.
*
* HaltProcessing is called during the anagram search. If it returns nonzero,
* then the search is aborted.
*
* Cdecl is a macro expanded before the name of all functions that must
* use C-style parameter passing. This lets you change the default parameter
* passing style for the other functions.
*/
/* char *malloc(); */
# define huge
# define far
# define StringFormat "%15s%c"
# define bigmalloc malloc
# define smallmalloc malloc
# define smallmallocfail (char *)0
# define HaltProcessing() 0 /* no easy way to interrupt on UNIX */
# define Cdecl
/* Code to be used only when debugging lives inside Debug().
* Code to be used only when collecting statistics lives inside Stat().
*/
#ifdef DEBUG
#define Debug(x) x
#else
#define Debug(x)
#endif
#ifdef STAT
#define Stat(x) x
#else
#define Stat(x)
#endif
/* A Word remembers the information about a candidate word. */
typedef struct {
Quad aqMask[MAX_QUADS]; /* the word's mask */
char * pchWord; /* the word itself */
unsigned cchLength; /* letters in the word */
} Word;
typedef Word * PWord;
typedef Word * * PPWord;
PWord apwCand[MAXCAND]; /* candidates we've found so far */
unsigned cpwCand; /* how many of them? */
/* A Letter remembers information about each letter in the phrase to be
* anagrammed.
*/
typedef struct {
unsigned uFrequency; /* how many times it appears */
unsigned uShift; /* how to mask */
unsigned uBits; /* the bit mask itself */
unsigned iq; /* which Quad to inspect? */
} Letter;
typedef Letter * PLetter;
Letter alPhrase[ALPHABET]; /* statistics on the current phrase */
#define lPhrase(ch) alPhrase[ch2i(ch)] /* quick access to a letter */
int cchPhraseLength; /* number of letters in phrase */
Quad aqMainMask[MAX_QUADS];/* the bit field for the full phrase */
Quad aqMainSign[MAX_QUADS];/* where the sign bits are */
int cchMinLength = 3;
/* auGlobalFrequency counts the number of times each letter appears, summed
* over all candidate words. This is used to decide which letter to attack
* first.
*/
unsigned auGlobalFrequency[ALPHABET];
char achByFrequency[ALPHABET]; /* for sorting */
char * pchDictionary; /* the dictionary is read here */
#define Zero(t) memset(t, 0, sizeof(t)) /* quickly zero out an integer array */
/* Fatal -- print a message before expiring */
void Fatal(char *pchMsg, unsigned u) {
fprintf(stderr, pchMsg, u);
exit(1);
}
/* ReadDict -- read the dictionary file into memory and preprocess it
*
* A word of length cch in the dictionary is encoded as follows:
*
* byte 0 = cch + 3
* byte 1 = number of letters in the word
* byte 2... = the word itself, null-terminated
*
* Observe that cch+3 is the length of the total encoding. These
* byte streams are concatenated, and terminated with a 0.
*/
void ReadDict(char *pchFile) {
FILE *fp;
char * pch;
char * pchBase;
unsigned long ulLen;
unsigned cWords = 0;
unsigned cLetters;
int ch;
struct stat statBuf;
if (stat(pchFile, &statBuf)) Fatal("Cannot stat dictionary\n", 0);
ulLen = statBuf.st_size + 2 * (unsigned long)MAXWORDS;
pchBase = pchDictionary = (char *)malloc(ulLen);
if(pchDictionary == NULL)
Fatal("Unable to allocate memory for dictionary\n", 0);
if ((fp = fopen(pchFile, "r")) == NULL)
Fatal("Cannot open dictionary\n", 0);
while (!feof(fp)) {
pch = pchBase+2; /* reserve for length */
cLetters = 0;
while ((ch = fgetc(fp)) != '\n' && ch != EOF) {
if (isalpha(ch)) cLetters++;
*pch++ = ch;
}
*pch++ = '\0';
*pchBase = pch - pchBase;
pchBase[1] = cLetters;
pchBase = pch;
cWords++;
}
fclose(fp);
*pchBase++ = 0;
fprintf(stderr, "main dictionary has %u entries\n", cWords);
if (cWords >= MAXWORDS)
Fatal("Dictionary too large; increase MAXWORDS\n", 0);
fprintf(stderr, "%lu bytes wasted\n", ulLen - (pchBase - pchDictionary));
}
void BuildMask(char * pchPhrase) {
int i;
int ch;
unsigned iq; /* which Quad? */
int cbtUsed; /* bits used in the current Quad */
int cbtNeed; /* bits needed for current letter */
Quad qNeed; /* used to build the mask */
bzero(alPhrase, sizeof(Letter)*ALPHABET);
bzero(aqMainMask, sizeof(Quad)*MAX_QUADS);
bzero(aqMainSign, sizeof(Quad)*MAX_QUADS);
/*
Zero(alPhrase);
Zero(aqMainMask);
Zero(aqMainSign);
*/
/* Tabulate letter frequencies in the phrase */
cchPhraseLength = 0;
while ((ch = *pchPhrase++) != '\0') {
if (isalpha(ch)) {
ch = tolower(ch);
lPhrase(ch).uFrequency++;
cchPhraseLength++;
}
}
/* Build masks */
iq = 0; /* which quad being used */
cbtUsed = 0; /* bits used so far */
for (i = 0; i < ALPHABET; i++) {
if (alPhrase[i].uFrequency == 0) {
auGlobalFrequency[i] = ~0; /* to make it sort last */
} else {
auGlobalFrequency[i] = 0;
for (cbtNeed = 1, qNeed = 1;
alPhrase[i].uFrequency >= qNeed;
cbtNeed++, qNeed <<= 1);
if (cbtUsed + cbtNeed > MASK_BITS) {
if (++iq >= MAX_QUADS)
Fatal("MAX_QUADS not large enough\n", 0);
cbtUsed = 0;
}
alPhrase[i].uBits = qNeed-1;
if (cbtUsed)
qNeed <<= cbtUsed;
aqMainSign[iq] |= qNeed;
aqMainMask[iq] |= (Quad)alPhrase[i].uFrequency << cbtUsed;
alPhrase[i].uShift = cbtUsed;
alPhrase[i].iq = iq;
cbtUsed += cbtNeed;
}
}
}
PWord
NewWord(void) {
PWord pw;
pw = (Word *)malloc(sizeof(Word));
if (pw == NULL)
Fatal("Out of memory after %d candidates\n", cpwCand);
return pw;
}
/* wprint -- print a word, followed by a space
*
* We would normally just use printf, but the string being printed is
* is a huge pointer (on an IBM PC), so special care must be taken.
*/
void wprint(char * pch) {
printf("%s ", pch);
}
PWord NextWord(void);
/* NextWord -- get another candidate entry, creating if necessary */
PWord NextWord(void) {
PWord pw;
if (cpwCand >= MAXCAND)
Fatal("Too many candidates\n", 0);
pw = apwCand[cpwCand++];
if (pw != NULL)
return pw;
apwCand[cpwCand-1] = NewWord();
return apwCand[cpwCand-1];
}
/* BuildWord -- build a Word structure from an ASCII word
* If the word does not fit, then do nothing.
*/
void BuildWord(char * pchWord) {
unsigned char cchFrequency[ALPHABET];
int i;
char * pch = pchWord;
PWord pw;
int cchLength = 0;
bzero(cchFrequency, sizeof(unsigned char)*ALPHABET);
/* Zero(cchFrequency); */
/* Build frequency table */
while ((i = *pch++) != '\0') {
if (!isalpha(i)) continue;
i = ch2i(tolower(i));
if (++cchFrequency[i] > alPhrase[i].uFrequency)
return;
++cchLength;
}
Debug(wprint(pchWord);)
/* Update global count */
for (i = 0; i < ALPHABET; i++)
auGlobalFrequency[i] += cchFrequency[i];
/* Create a Word structure and fill it in, including building the
* bitfield of frequencies.
*/
pw = NextWord();
bzero(pw->aqMask, sizeof(Quad)*MAX_QUADS);
/* Zero(pw->aqMask); */
pw->pchWord = pchWord;
pw->cchLength = cchLength;
for (i = 0; i < ALPHABET; i++) {
pw->aqMask[alPhrase[i].iq] |=
(Quad)cchFrequency[i] << alPhrase[i].uShift;
}
}
/* AddWords -- build the list of candidates */
void
AddWords(void) {
char * pch = pchDictionary; /* walk through the dictionary */
cpwCand = 0;
while (*pch) {
if ((pch[1] >= cchMinLength && pch[1]+cchMinLength <= cchPhraseLength)
|| pch[1] == cchPhraseLength)
BuildWord(pch+2);
pch += *pch;
}
fprintf(stderr, "%d candidates\n", cpwCand);
}
void DumpCandidates(void) {
unsigned u;
for (u = 0; u < cpwCand; u++)
printf(StringFormat, apwCand[u]->pchWord, (u % 4 == 3) ? '\n' : ' ');
printf("\n");
}
PWord apwSol[MAXSOL]; /* the answers */
int cpwLast;
Debug(
void DumpWord(Quad * pq) {
int i;
Quad q;
for (i = 0; i < ALPHABET; i++) {
if (alPhrase[i].uFrequency == 0) continue;
q = pq[alPhrase[i].iq];
if (alPhrase[i].uShift) q >>= alPhrase[i].uShift;
q &= alPhrase[i].uBits;
while (q--) putchar('a'+i);
}
putchar(' ');
}
) /* End of debug code */
void DumpWords(void) {
static int X;
int i;
X = (X+1) & 1023;
if (X != 0) return;
for (i = 0; i < cpwLast; i++) wprint(apwSol[i]->pchWord);
printf("\n");
}
Stat(unsigned long ulHighCount; unsigned long ulLowCount;)
jmp_buf jbAnagram;
#define OneStep(i) \
if ((aqNext[i] = pqMask[i] - pw->aqMask[i]) & aqMainSign[i]) { \
ppwStart++; \
continue; \
}
void
FindAnagram(Quad * pqMask, PPWord ppwStart, int iLetter)
{
Quad aqNext[MAX_QUADS];
register PWord pw;
Quad qMask;
unsigned iq;
PPWord ppwEnd = &apwCand[0];
ppwEnd += cpwCand;
;
if (HaltProcessing()) longjmp(jbAnagram, 1);
Debug(printf("Trying :"); DumpWord(pqMask); printf(":\n");)
for (;;) {
iq = alPhrase[achByFrequency[iLetter]].iq;
qMask = alPhrase[achByFrequency[iLetter]].uBits <<
alPhrase[achByFrequency[iLetter]].uShift;
if (pqMask[iq] & qMask) break;
iLetter++;
}
Debug(printf("Pivoting on %c\n", i2ch(achByFrequency[iLetter]));)
while (ppwStart < ppwEnd) { /* Half of the program execution */
pw = *ppwStart; /* time is spent in these three */
Stat(if (++ulLowCount == 0) ++ulHighCount;)
#if MAX_QUADS > 0
OneStep(0); /* lines of code. */
#endif
#if MAX_QUADS > 1
OneStep(1);
#endif
#if MAX_QUADS > 2
OneStep(2);
#endif
#if MAX_QUADS > 3
OneStep(3);
#endif
#if MAX_QUADS > 4
@@"Add more unrolling steps here, please."@@
#endif
/* If the pivot letter isn't present, defer this word until later */
if ((pw->aqMask[iq] & qMask) == 0) {
*ppwStart = *--ppwEnd;
*ppwEnd = pw;
continue;
}
/* If we get here, this means the word fits. */
apwSol[cpwLast++] = pw;
if (cchPhraseLength -= pw->cchLength) { /* recurse */
Debug(DumpWords();)
/* The recursive call scrambles the tail, so we have to be
* pessimistic.
*/
ppwEnd = &apwCand[0];
ppwEnd += cpwCand;
FindAnagram(&aqNext[0],
ppwStart, iLetter);
} else DumpWords(); /* found one */
cchPhraseLength += pw->cchLength;
--cpwLast;
ppwStart++;
continue;
}
;
}
int Cdecl CompareFrequency(char *pch1, char *pch2) {
if (auGlobalFrequency[*pch1] < auGlobalFrequency[*pch2])
return -1;
if (auGlobalFrequency[*pch1] > auGlobalFrequency[*pch2])
return 1;
if (*pch1 < *pch2)
return -1;
if (*pch1 > *pch2)
return 1;
return 0;
}
void SortCandidates(void) {
int i;
/* Sort the letters by frequency */
for (i = 0; i < ALPHABET; i++) achByFrequency[i] = i;
qsort(achByFrequency, ALPHABET, sizeof(char),
(int (*)(const void *, const void *))CompareFrequency);
fprintf(stderr, "Order of search will be ");
for (i = 0; i < ALPHABET; i++)
fputc(i2ch(achByFrequency[i]), stderr);
fputc('\n', stderr);
}
int fInteractive;
char * GetPhrase(char * pch, int size) {
if (fInteractive) printf(">");
fflush(stdout);
if (fgets(pch, size, stdin) == NULL) {
#ifdef PLUS_STATS
PrintDerefStats(stderr);
PrintHeapSize(stderr);
#endif /* PLUS_STATS */
exit(0);
}
return(pch);
}
char achPhrase[255];
int Cdecl main(int cpchArgc, char **ppchArgv) {
if (cpchArgc != 2 && cpchArgc != 3)
Fatal("Usage: anagram dictionary [length]\n", 0);
if (cpchArgc == 3)
cchMinLength = atoi(ppchArgv[2]);
fInteractive = isatty(1);
ReadDict(ppchArgv[1]);
while (GetPhrase(&achPhrase[0], sizeof(achPhrase)) != NULL) {
if (isdigit(achPhrase[0])) {
cchMinLength = atoi(achPhrase);
printf("New length: %d\n", cchMinLength);
} else if (achPhrase[0] == '?') {
DumpCandidates();
} else {
BuildMask(&achPhrase[0]);
AddWords();
if (cpwCand == 0 || cchPhraseLength == 0) continue;
Stat(ulHighCount = ulLowCount = 0;)
cpwLast = 0;
SortCandidates();
if (setjmp(jbAnagram) == 0)
FindAnagram(&aqMainMask[0], &apwCand[0], 0);
Stat(printf("%lu:%lu probes\n", ulHighCount, ulLowCount);)
}
}
return 0;
}