From: Ian Jackson <ijackson@chiark.greenend.org.uk>
Date: Thu, 25 Jul 2013 17:30:47 +0000 (+0100)
Subject: memcmp: Introduce and use consttime_memeq
X-Git-Tag: debian/0.3.0_beta2~36
X-Git-Url: https://git.distorted.org.uk/~mdw/secnet/commitdiff_plain/5ad34db2ccbbfbc936cd16121bed2110b70594ba?hp=5ad34db2ccbbfbc936cd16121bed2110b70594ba

memcmp: Introduce and use consttime_memeq

We need to use a constant-time memcmp in MAC checking, to avoid
leaking (to an adversary) how much of the MAC is right.  (This would
be especially dangerous if our MAC was outside the encryption, which
thankfully it isn't.)

The use of "volatile" on the accumulator prevents the compiler from
optimising away any of the updates to the accumulator, each of which
depends on all the bits of the two bytes being compared.  So that
stops the compiler shortcutting the computation.  This also prevents
the compiler spotting our boolean canonicalisation, and forces it to
do it the way we say.

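The C standard guarantees this behaviour: C99 6.7.3(6) requires every
access to a volatile object to be evaluated strictly according to the
rules of the abstract machine.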

In an attempt to get the compiler to eliminate the pointless repeated
loading and storing of the single-byte accumulator value, I have
specified it as "register volatile".  There is no rule against
"register volatile", but my compiler ignores the "register".

To double check that all is well, here is an annotated disassembly,
from this command line:

  gcc -save-temps -DHAVE_CONFIG_H -I. -I. -Wall -Wwrite-strings -g -O2	\
      -Werror -W -Wno-unused -Wno-pointer-sign -Wstrict-prototypes	\
      -Wmissing-prototypes -Wmissing-declarations -Wnested-externs	\
      -Wredundant-decls -Wpointer-arith -Wformat=2 -Winit-self		\
      -Wswitch-enum -Wunused-variable -Wbad-function-cast		\
      -Wno-strict-aliasing -fno-strict-aliasing -c util.c -o util.o

This is the relevant part of util.s, as generated by gcc 4.4.5-8
i486-linux-gnu, with my annotations:

  .globl consttime_memeq
	  .type   consttime_memeq, @function
  consttime_memeq:
  .LFB80:
	  .loc 1 367 0
	  .cfi_startproc
  .LVL8:
	  pushl   %ebp
  .LCFI8:
	  .cfi_def_cfa_offset 8
	  movl    %esp, %ebp
	  .cfi_offset 5, -8
  .LCFI9:
	  .cfi_def_cfa_register 5
	  pushl   %edi
	  pushl   %esi
	  pushl   %ebx
	  subl    $16, %esp
	  .loc 1 367 0
	  movl    16(%ebp), %ebx	ebx : n
	  .cfi_offset 3, -20
	  .cfi_offset 6, -16
	  .cfi_offset 7, -12
	  movl    8(%ebp), %esi		esi : s1in
	  movl    12(%ebp), %edi	edi : s2in
	  .loc 1 369 0
	  movb    $0, -13(%ebp)		-13(ebp) : accumulator
  .LVL9:
	  .loc 1 371 0
	  testl   %ebx, %ebx		if (!n)
	  je      .L15			    goto no_bytes;
	  	  			i.e. if (n) { ...loop... }
  .LVL10:

 The compiler has chosen to invent an offset variable for controlling
 the loop, rather than pointer arithmetic.  We'll call that variable
 i.  It ranges from 0..n-1 as expected.  The compiler doesn't
 explicitly compute the per-iteration pointers s1 and s2, instead using
 an addressing mode.

	  xorl    %edx, %edx		edx : i
	  .p2align 4,,7
	  .p2align 3
  .L16:					more_bytes: /* loop */
	  .loc 1 372 0
	  movzbl  (%edi,%edx), %eax	eax = *s2;
  .LVL11:
	  movzbl  -13(%ebp), %ecx	ecx = accumulator;
	  xorb    (%esi,%edx), %al	al = *s1 ^ *s2;
	  addl    $1, %edx     		i++;
	  orl     %ecx, %eax		eax = accumulator | (*s1^*s2)
  .LVL12:
	  .loc 1 371 0
	  cmpl    %edx, %ebx		[ if (i!=n) ... ]
	  .loc 1 372 0
	  movb    %al, -13(%ebp)	accumulator = eax;
	  	       			i.e., overall,
					  accumulator |= *s1 ^ *s2
	  .loc 1 371 0
	  jne     .L16			... [if (i!=n)]
	  	  			    goto more_bytes;
  .LVL13:
  .L15:					no_bytes:
	  .loc 1 374 0			/* end of loop and if */

	       	     			/* now doing the shift-by-4: */
	  movzbl  -13(%ebp), %eax	eax = accumulator;
	  movzbl  -13(%ebp), %edx	edx = accumulator;
	  shrb    $4, %al    		al >>= 4;
  .LVL14:
	  orl     %edx, %eax		eax |= edx;
  .LVL15:
	  movb    %al, -13(%ebp)	accumulator = eax;
	  	       			i.e., overall,
					  accumulator |= accumulator >> 4;
	  .loc 1 375 0
	  movzbl  -13(%ebp), %eax	/* same again but for shift-by-2 */
	  movzbl  -13(%ebp), %edx
	  shrb    $2, %al
	  orl     %edx, %eax
  .LVL16:
	  movb    %al, -13(%ebp)
	  .loc 1 376 0
	  movzbl  -13(%ebp), %eax	/* same again but for shift-by-1 */
	  movzbl  -13(%ebp), %edx
	  shrb    %al
	  orl     %edx, %eax
  .LVL17:
	  movb    %al, -13(%ebp)	/* now computed final accumulator */
	  .loc 1 377 0
	  movzbl  -13(%ebp), %eax	eax = accumulator;
	  andl    $1, %eax   		eax &= 1;
  .LVL18:
	  movb    %al, -13(%ebp)	accumulator = eax;
	  .loc 1 378 0
	  movzbl  -13(%ebp), %eax	eax = accumulator;
	  xorl    $1, %eax   		eax ^= 1;
  .LVL19:
	  movb    %al, -13(%ebp)	accumulator = eax;
	  .loc 1 379 0
	  movzbl  -13(%ebp), %eax	eax = accumulator;
  .LVL20:
	  .loc 1 380 0
	  addl    $16, %esp		function epilogue
	  popl    %ebx 			    ...
  .LVL21:
	  popl    %esi 			    ...
  .LVL22:
	  popl    %edi 			    ...
  .LVL23:
	  .loc 1 379 0
	  movzbl  %al, %eax		zero-extend al (redundant)
	  .loc 1 380 0
	  popl    %ebp 			    ...
	  ret	  			return eax;
	  				 i.e. return accumulator;
	  .cfi_endproc
  .LFE80:
	  .size   consttime_memeq, .-consttime_memeq

Signed-off-by: Ian Jackson <ijackson@chiark.greenend.org.uk>
---