reduce-historical: Documentation

/*
prefix.c

Copyright (C) 2003-2006 Gil Dabah, http://ragestorm.net/distorm/
This library is licensed under the BSD license. See the file COPYING.
*/


#include "prefix.h"

#include "textdefs.h"
#include "x86defs.h"
#include "instructions.h"

// The main purpose of this module is to keep track of all kind of prefixes a single instruction may have.
// The problem is that a single instruction may have up to five different prefix-type.
// That's why I have to detect such cases and drop those excess prefixes.

int is_prefix(unsigned char ch, _DecodeType dt)
{
	switch (ch) {
		// for i in xrange(0x40, 0x50): print "case 0x%2x:" % i
		case 0x40: // REX:
		case 0x41:
		case 0x42:
		case 0x43:
		case 0x44:
		case 0x45:
		case 0x46:
		case 0x47:
		case 0x48:
		case 0x49:
		case 0x4a:
		case 0x4b:
		case 0x4c:
		case 0x4d:
		case 0x4e:
		case 0x4f: return (dt == Decode64Bits);
		case PREFIX_LOCK: return 1;
		case PREFIX_REPNZ: return 1;
		case PREFIX_REP: return 1;
		case PREFIX_CS: return 1;
		case PREFIX_SS: return 1;
		case PREFIX_DS: return 1;
		case PREFIX_ES: return 1;
		case PREFIX_FS: return 1;
		case PREFIX_GS: return 1;
		case PREFIX_OP_SIZE: return 1;
		case PREFIX_ADDR_SIZE: return 1;
	}
	return 0;
}

const unsigned char* PREFIX_MIN(const unsigned char* a, const unsigned char* b, const unsigned char* c, const unsigned char* d, const unsigned char* def)
{
	// Check for null.
	// Return smallest (=first good prefix to take into account).

	if (!a && !b && !c && !d) return def;
	if (!a) a = (const unsigned char*)~0; // MAX PTR ? :)
	if (!b) b = (const unsigned char*)~0;
	if (!c) c = (const unsigned char*)~0;
	if (!d) d = (const unsigned char*)~0;

	if (b < a) a = b;
	if (c < a) a = c;
	if (d < a) a = d;
	return a == (const unsigned char*)~0 ? def : a;
}

// Return the flag and type of given prefix.
void get_prefix_flag(unsigned char ch, _PrefixInfo* pi, _DecodeType dt)
{
	pi->flag = INST_FLAGS_NONE;
	pi->type = PRE_NONE;
	/*
	NOTE: AMD treat lock/rep as two different groups... But I am based on Intel.

	- Lock and Repeat:
			- 0xF0 � LOCK
			- 0xF2 � REPNE/REPNZ
			- 0xF3 - REP/REPE/REPZ
	- Segment Override:
			- 0x2E - CS
			- 0x36 - SS
			- 0x3E - DS
			- 0x26 - ES
			- 0x64 - FS
			- 0x65 - GS
		- Operand-Size Override: 0x66, switching default size.
		- Address-Size Override: 0x67, switching default size.

		64 Bits:
		- REX: 0x40 - 0x4f, extends register access.
	*/

	switch (ch)
	{
		// REX type, 64 bits decoding mode only:
		case 0x40:
		case 0x41:
		case 0x42:
		case 0x43:
		case 0x44:
		case 0x45:
		case 0x46:
		case 0x47:
		case 0x48:
		case 0x49:
		case 0x4a:
		case 0x4b:
		case 0x4c:
		case 0x4d:
		case 0x4e:
		case 0x4f:
			if (dt == Decode64Bits) {
				pi->flag = INST_PRE_REX;
				pi->type = PRE_REX;
			}
		break;

			// LOCK and REPx type:
		case PREFIX_LOCK: pi->flag = INST_PRE_LOCK; pi->type = PRE_LOKREP; break;
		case PREFIX_REPNZ: pi->flag = INST_PRE_REPNZ; pi->type = PRE_LOKREP; break;
		case PREFIX_REP: pi->flag = INST_PRE_REP; pi->type = PRE_LOKREP; break;

		// Seg Overide type:
		case PREFIX_CS: pi->flag = INST_PRE_CS; pi->type = PRE_SEGOVRD; break;
		case PREFIX_SS: pi->flag = INST_PRE_SS; pi->type = PRE_SEGOVRD; break;
		case PREFIX_DS: pi->flag = INST_PRE_DS; pi->type = PRE_SEGOVRD; break;
		case PREFIX_ES: pi->flag = INST_PRE_ES; pi->type = PRE_SEGOVRD; break;
		case PREFIX_FS: pi->flag = INST_PRE_FS; pi->type = PRE_SEGOVRD; break;
		case PREFIX_GS: pi->flag = INST_PRE_GS; pi->type = PRE_SEGOVRD; break;

		// Op Size type:
		case PREFIX_OP_SIZE: pi->flag = INST_PRE_OP_SIZE; pi->type = PRE_OPSIZE; break;

		// Addr Size type:
		case PREFIX_ADDR_SIZE: pi->flag = INST_PRE_ADDR_SIZE; pi->type = PRE_ADDRSIZE; break;
	}
}

void decode_prefixes(const unsigned char* code, long codeLen, _PrefixState* ps, _DecodeType dt)
{
	/* First thing to do, scan for prefixes, there are five types of prefixes.
	There may be up to five prefixes before a single instruction, not the same type, no special order,
	except REX must precede immediately the first opcode.
	BTW - This is the reason why I didn't make the REP prefixes part of the instructions (STOS/SCAS/etc).
	
	Another thing, the instruction maximum size is 15 bytes, thus if we read more than 15 bytes, we will halt.
	*/

	_PrefixInfo pi;
	ps->start = code;

	while ((--codeLen >= 0) || (code - ps->start >= INST_MAXIMUM_SIZE)) {
		// Examine what type of prefix we got.
		get_prefix_flag(*code, &pi, dt);
		if (pi.flag == INST_FLAGS_NONE) break; // Halt scanning.

		/*
		If we got something that is ALREADY included,
		we will have to skip that many bytes 'till we get past the first occurrence.
		Take a look: XYzABCUVz, the result would be: ABCUVz;  XYz are being dropped (we have to skip them),
		because then we would have z included twice, which is not allowed by 80x86.
		*/
		switch (pi.type)
		{
			case PRE_LOKREP:
				if ((ps->totalPrefixes & INST_PRE_LOKREP_MASK) != 0) { // Is it second time we got this same type prefix?
					// Remove all flags of this group, because we don't know which is the set one.
					ps->totalPrefixes &= ~INST_PRE_LOKREP_MASK;

					// Check whether we have to remove other types.
					// We remove other types if they appeared before the first repeating type.
					// We also have to update flags and positions.
					if (ps->segovrdPos && ps->segovrdPos < ps->lokrepPos) {
						// Update flags, remove any flag that is segment override concerned.
						ps->totalPrefixes &= ~INST_PRE_SEGOVRD_MASK;
						ps->segovrdPos = NULL;
					}
					if (ps->opsizePos && ps->opsizePos < ps->lokrepPos) {
						ps->totalPrefixes &= ~INST_PRE_OP_SIZE; // No need for mask, it's a single bit.
						ps->opsizePos = NULL;
					}
					if (ps->addrsizePos && ps->addrsizePos < ps->lokrepPos) {
						ps->totalPrefixes &= ~INST_PRE_ADDR_SIZE;
						ps->addrsizePos = NULL;
					}
					if (ps->rexpos && ps->rexpos < ps->lokrepPos) {
						ps->totalPrefixes &= ~INST_PRE_REX;
						ps->rexpos = NULL;
					}

					// Update current type position to last ^good^ position.
					// Notice we update the position ONLY after we removed dropped prefixes, so they're dropped relative to the old one.
					ps->lokrepPos = code;

					// start points to the first prefix we take into account (ignoring dropped prefixes).
					// Notice we do the assignment after the above if statements, because maybe they could affect the result.
					ps->start = PREFIX_MIN(ps->segovrdPos, ps->opsizePos, ps->addrsizePos, ps->rexpos, ps->lokrepPos);
				} else {
					// Update position to first occurence.
					ps->lokrepPos = code;
				}
				// Set flags anyways, if it's second time we cleaned the flags of this group already.
				ps->totalPrefixes |= pi.flag;
			break;
			case PRE_SEGOVRD:
				if ((ps->totalPrefixes & INST_PRE_SEGOVRD_MASK) != 0) {
					ps->totalPrefixes &= ~INST_PRE_SEGOVRD_MASK;

					if (ps->lokrepPos && ps->lokrepPos < ps->segovrdPos) {
						ps->totalPrefixes &= ~INST_PRE_LOKREP_MASK;
						ps->lokrepPos = NULL;
					}
					if (ps->opsizePos && ps->opsizePos < ps->segovrdPos) {
						ps->totalPrefixes &= ~INST_PRE_OP_SIZE;
						ps->opsizePos = NULL;
					}
					if (ps->addrsizePos && ps->addrsizePos < ps->segovrdPos) {
						ps->totalPrefixes &= ~INST_PRE_ADDR_SIZE;
						ps->addrsizePos = NULL;
					}
					if (ps->rexpos && ps->rexpos < ps->segovrdPos) {
						ps->totalPrefixes &= ~INST_PRE_REX;
						ps->rexpos = NULL;
					}
					ps->segovrdPos = code;
					ps->start = PREFIX_MIN(ps->lokrepPos, ps->opsizePos, ps->addrsizePos, ps->rexpos, ps->segovrdPos);
				} else {
					ps->segovrdPos = code;
				}
				ps->totalPrefixes |= pi.flag;
			break;
			case PRE_OPSIZE:
				if (ps->totalPrefixes & pi.flag) {
					if (ps->lokrepPos && ps->lokrepPos < ps->opsizePos) {
						ps->totalPrefixes &= ~INST_PRE_LOKREP_MASK;
						ps->lokrepPos =	NULL;
					}
					if (ps->segovrdPos && ps->segovrdPos < ps->opsizePos) {
						ps->totalPrefixes &= ~INST_PRE_SEGOVRD_MASK;
						ps->segovrdPos = NULL;
					}
					if (ps->addrsizePos && ps->addrsizePos < ps->opsizePos) {
						ps->totalPrefixes &= ~INST_PRE_ADDR_SIZE;
						ps->addrsizePos = NULL;
					}
					if (ps->rexpos && ps->rexpos < ps->opsizePos) {
						ps->totalPrefixes &= ~INST_PRE_REX;
						ps->rexpos = NULL;
					}
					ps->opsizePos = code;
					ps->start = PREFIX_MIN(ps->lokrepPos, ps->segovrdPos, ps->addrsizePos, ps->rexpos, ps->opsizePos);
				} else {
					ps->totalPrefixes |= pi.flag;
					ps->opsizePos = code;
				}
			break;
			case PRE_ADDRSIZE:
				if (ps->totalPrefixes & pi.flag) {
					if (ps->lokrepPos && ps->lokrepPos < ps->addrsizePos) {
						ps->totalPrefixes &= ~INST_PRE_LOKREP_MASK;
						ps->lokrepPos = NULL;
					}
					if (ps->segovrdPos && ps->segovrdPos < ps->addrsizePos) {
						ps->totalPrefixes &= ~INST_PRE_SEGOVRD_MASK;
						ps->segovrdPos = NULL;
					}
					if (ps->opsizePos && ps->opsizePos < ps->addrsizePos) {
						ps->totalPrefixes &= ~INST_PRE_OP_SIZE;
						ps->opsizePos = NULL;
					}
					if (ps->rexpos && ps->rexpos < ps->addrsizePos) {
						ps->totalPrefixes &= ~INST_PRE_REX;
						ps->rexpos = NULL;
					}
					ps->addrsizePos = code;
					ps->start = PREFIX_MIN(ps->lokrepPos, ps->segovrdPos, ps->opsizePos, ps->rexpos, ps->addrsizePos);
				} else {
					ps->totalPrefixes |= pi.flag;
					ps->addrsizePos = code;
				}
			break;
			case PRE_REX:
				if (ps->totalPrefixes & pi.flag) {
					if (ps->lokrepPos && ps->lokrepPos < ps->rexpos) {
						ps->totalPrefixes &= ~INST_PRE_LOKREP_MASK;
						ps->lokrepPos = NULL;
					}
					if (ps->segovrdPos && ps->segovrdPos < ps->rexpos) {
						ps->totalPrefixes &= ~INST_PRE_SEGOVRD_MASK;
						ps->segovrdPos = NULL;
					}
					if (ps->opsizePos && ps->opsizePos < ps->rexpos) {
						ps->totalPrefixes &= ~INST_PRE_OP_SIZE;
						ps->opsizePos = NULL;
					}
					if (ps->addrsizePos && ps->addrsizePos < ps->rexpos) {
						ps->totalPrefixes &= ~INST_PRE_ADDR_SIZE;
						ps->addrsizePos = NULL;
					}
					ps->rexpos = code;
					ps->start = PREFIX_MIN(ps->lokrepPos, ps->segovrdPos, ps->opsizePos, ps->addrsizePos, ps->rexpos);
				} else {
					ps->totalPrefixes |= pi.flag;
					ps->rexpos = code;
				}
				break;
		}
		code++;
	}
	// Save last byte scanned, so the decoder could keep on scanning from this point and on and on and on.
	// In addition the decoder is able to know that the last byte could lead to MMX/SSE instructions (preceding REX if exists).
	ps->last = code;
}

/*
This function gets the prefix state of the last instruction and the total prefixes
of that instruction which we got from the code itself,
then it DB's everything which wasn't used, by looking at the instruction's prefixes we GOT
and the USED prefixes (which is returned by the extract_operand).
Finally, it returns a string of the output (unused prefixes).

Note: This function should get the valid prefixes which weren't dropped.
			The Decode function deals with the dropped ones by itself.

Here is a small sample for showing a case where there are some prefixes which weren't used, and should be DB'ed.

16 bits decoding mode:
40 ~ INC AX
66 40 ~ INC EAX
66 2E 40 ~ DB 2E; INC EAX
67 2E ~ DB 67; INC AX

The whole thing is working by using the usedPrefixes, this variable gets updated
whenever the decoder mechanism is affected by any one of the totalPrefixes prefixes.
There are a few spots in the code along some of the decoder functions which are responsible for updating the usedPrefixes.

When calling this function, you assume totalPrefixes contains all non-dropped prefixes for that instruction,
and also the usedPrefixes is filled already.

*/

void get_unused_prefixes_list(unsigned char unusedList[MAX_PREFIXES], _PrefixState* ps, _DecodeType dt)
{
	// We might have 5 unused prefixes at most, up to 5 prefixes for one instruction.
	const unsigned char* ptrs[MAX_PREFIXES] = {0};

	unsigned int i, j;
	const unsigned char* tmp = NULL;

	memset(unusedList, 0, MAX_PREFIXES);

	// We have to restore at this point flags that were disabled manually, because they are ignored in 64 bits.
	// Therefore, we will have to check whether they were set before we disabled them and now reenable them in order
	// to output them as unused prefixes.

	// Check out whether the REX prefix was ignored... and reenable it, so we can see if it were used or not.
	if (ps->rexpos != NULL) {
		ps->totalPrefixes |= INST_PRE_REX;
		// Reenable operand size prefix, so it will be dropped as unused.
		if ((ps->opsizePos != NULL) && (*ps->rexpos & PREFIX_REX_W)) ps->totalPrefixes |= INST_PRE_OP_SIZE;
	}
	// In 64 bits, it could be that we disabled CS, ES, DS, SS.
	if ((dt == Decode64Bits) && (ps->segovrdPos != NULL) && ((ps->totalPrefixes & (INST_PRE_GS | INST_PRE_FS)) == 0)) {
		// Only if it weren't a valid 64 bits segment override prefix, we will have to reenable it manually.
		// It could be anyone of the invalid segments...
		ps->totalPrefixes |= INST_PRE_CS;
	}

	// Remove all used prefixes.
	ps->totalPrefixes &= ~ps->usedPrefixes;

	// Caller function depends on this value, so it will know how many unused prefixes there are.
	ps->unusedCount = 0;

	// All are used? Cool then.
	if (ps->totalPrefixes == ps->usedPrefixes) return ;

	// Determine what types of prefixes were unused.
	// Every type has only one pointer to that prefix.
	if (ps->totalPrefixes & INST_PRE_OP_SIZE) ptrs[ps->unusedCount++] = ps->opsizePos;
	if (ps->totalPrefixes & INST_PRE_ADDR_SIZE) ptrs[ps->unusedCount++] = ps->addrsizePos;
	if (ps->totalPrefixes & INST_PRE_LOKREP_MASK) ptrs[ps->unusedCount++] = ps->lokrepPos;
	if (ps->totalPrefixes & INST_PRE_SEGOVRD_MASK) ptrs[ps->unusedCount++] = ps->segovrdPos;
	if (ps->totalPrefixes & INST_PRE_REX) ptrs[ps->unusedCount++] = ps->rexpos;

	// Sort them, so you output them by their real order.
	// Bubble yak is good enough.
	for (i = 0; i < ps->unusedCount; i++) {
		for (j = 0; j < ps->unusedCount; j++) {
			if (ptrs[j] > ptrs[i]) {
				tmp = ptrs[j];
				ptrs[j] = ptrs[i];
				ptrs[i] = tmp;
			}
		}
	}

	// Get values and store in the given array.
	for (i = 0; i < ps->unusedCount; i++)
		unusedList[i] = *ptrs[i];
}


// Concatenates a string of the used segment by examining the prefixes.
void str_seg_text(_WString* s, _PrefixState* ps, _DecodeType dt)
{
	_iflags flags = ps->totalPrefixes & INST_PRE_SEGOVRD_MASK;
	// Segment Override prefixes are ignored in 64 bits.
	if (flags == 0) return;

	// 64 bits mode ignoers some prefixes.
	if (dt != Decode64Bits) {
		switch (flags)
		{
			case INST_PRE_CS:
				ps->usedPrefixes |= INST_PRE_CS;
				strcat_WSN(s, PREFIX_CS_TEXT);
			return;
			case INST_PRE_SS:
				ps->usedPrefixes |= INST_PRE_SS;
				strcat_WSN(s, PREFIX_SS_TEXT);
			return;
			case INST_PRE_DS:
				ps->usedPrefixes |= INST_PRE_DS;
				strcat_WSN(s, PREFIX_DS_TEXT);
			return;
			case INST_PRE_ES:
				ps->usedPrefixes |= INST_PRE_ES;
				strcat_WSN(s, PREFIX_ES_TEXT);
			return;
		}
	}
	switch (flags)
	{
		case INST_PRE_FS:
			ps->usedPrefixes |= INST_PRE_FS;
			strcat_WSN(s, PREFIX_FS_TEXT);
		return;
		case INST_PRE_GS:
			ps->usedPrefixes |= INST_PRE_GS;
			strcat_WSN(s, PREFIX_GS_TEXT);
		return;
	}
}