(out-of-band) performance with gcc3

Konstantin Popov kost at sics.se
Thu Feb 12 12:51:17 CET 2004


Denys,

thanks for the reminder - I apparently confused this test with
another "foo.cc" that I answered before.

First, I had to modify the source as follows because it did not
compile with gcc 2.95.3:

 ----- cut -----
void* program[1000];
int counter = 0;

static const unsigned int AGAIN = 1000000;

#define DISPATCH(N) goto ** (PC+=N)

void foo (bool init)
{
  void ** PC = program;
  unsigned int again = AGAIN;

  if (init) {
    for (int i=0;i<999;i++) program[i] = && DECR;
    program[999] = && HALT;
    return;
  } else {
    DISPATCH(0);
  DECR:
    counter += 1;
    DISPATCH(1);
  HALT:
    {
      again -= 1;
      if (again)
	{
	  PC=program;
	  DISPATCH(0);
	}
      return;
    }
  }
}

int main()
{
  foo(true);
  foo(false);
}
 ----- cut -----

which did not make any difference with gcc 3.3.2:

 ----- cut -----
kost (7) gcc --version
gcc (GCC) 3.3.2
Copyright (C) 2003 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

kost (13) gcc -O3 foo.cc
kost (14) time ./a.out
3.064u 0.023s 0:03.15 97.7%     0+0k 0+0io 75pf+0w
 ----- cut -----

The gcc 2.95.3 build actually runs slower (which does not make it very
pedagogical :-)):

 ----- cut -----
kost (19) gcc --version
2.95.3
kost (20) gcc -O3 foo.cc
kost (21) time ./a.out
5.382u 0.056s 0:05.57 97.4%     0+0k 0+0io 75pf+0w
 ----- cut -----

but it is slower for different reasons, as you can see if you check the
.s files. DISPATCH itself is still compiled better by the old gcc.

 ----- cut -----
kost (22) gcc -O3 -S -fverbose-asm foo.cc
 ----- cut -----
	.file	"foo.cc"
# GNU C++ version 3.3.2 (i686-pc-linux-gnu)
#	compiled by GNU C version 3.3.2.
# GGC heuristics: --param ggc-min-expand=64 --param ggc-min-heapsize=64260
# options passed:  -D__GNUC__=3 -D__GNUC_MINOR__=3 -D__GNUC_PATCHLEVEL__=2
# -D_GNU_SOURCE -D__GNUG__=3 -auxbase -O3 -fverbose-asm
# options enabled:  -fdefer-pop -foptimize-sibling-calls -fcse-follow-jumps
# -fcse-skip-blocks -fexpensive-optimizations -fthread-jumps
# -fstrength-reduce -fpeephole -fforce-mem -ffunction-cse
# -fkeep-static-consts -fcaller-saves -fpcc-struct-return -fgcse -fgcse-lm
# -fgcse-sm -floop-optimize -fcrossjumping -fif-conversion -fif-conversion2
# -frerun-cse-after-loop -frerun-loop-opt -fdelete-null-pointer-checks
# -fschedule-insns2 -fsched-interblock -fsched-spec -fbranch-count-reg
# -fexceptions -freorder-blocks -freorder-functions -frename-registers
# -fcprop-registers -fcommon -fverbose-asm -fgnu-linker -fregmove
# -foptimize-register-move -fargument-alias -fstrict-aliasing
# -fmerge-constants -fzero-initialized-in-bss -fident -fpeephole2
# -fguess-branch-probability -fmath-errno -ftrapping-math -m80387
# -mhard-float -mno-soft-float -mieee-fp -mfp-ret-in-387
# -maccumulate-outgoing-args -mcpu=pentiumpro -march=i386

.globl program
	.bss
	.align 32
	.type	program, @object
	.size	program, 4000
program:
	.zero	4000
.globl counter
	.align 4
	.type	counter, @object
	.size	counter, 4
counter:
	.zero	4
	.text
	.align 2
	.p2align 4,,15
.globl _Z3foob
	.type	_Z3foob, @function
_Z3foob:
.LFB4:
	pushl	%ebp
.LCFI0:
	movl	$program, %edx	#  PC
	movl	%esp, %ebp
.LCFI1:
	cmpb	$0, 8(%ebp)	#  init
	movl	$1000000, %ecx	#  again
	pushl	%ebx
.LCFI2:
	je	.L2
	xorl	%eax, %eax
	.p2align 4,,15
.L8:
	movl	$.L7, %ecx
	movl	%ecx, program(,%eax,4)	#  program
	incl	%eax	#  i
	cmpl	$998, %eax	#  i
	jle	.L8
	movl	$.L9, %edx
	movl	%edx, program+3996	#  program
.L1:
	popl	%ebx
	popl	%ebp
	ret
.L2:
	movl	program, %eax
	movl	%eax, %ebx
	jmp	*%eax
.L7:
	incl	counter	#  counter
	addl	$4, %edx	#  PC
	movl	(%edx), %eax	# * PC
	jmp	*%eax
.L9:
	decl	%ecx	#  again
	je	.L1
	movl	$program, %edx	#  PC
	movl	%ebx, %eax
	jmp	*%eax
.LFE4:
	.size	_Z3foob, .-_Z3foob
	.align 2
	.p2align 4,,15
.globl main
	.type	main, @function
main:
.LFB5:
	pushl	%ebp
.LCFI3:
	movl	%esp, %ebp
.LCFI4:
	subl	$8, %esp
.LCFI5:
	andl	$-16, %esp
	movl	$1, (%esp)
	call	_Z3foob
	movl	$0, (%esp)
	call	_Z3foob
	movl	%ebp, %esp
	xorl	%eax, %eax
	popl	%ebp
	ret
.LFE5:
	.size	main, .-main
	.ident	"GCC: (GNU) 3.3.2"
 ----- cut -----

 ----- cut -----
gcc -O3 -S -fverbose-asm foo.cc
 ----- cut -----
	.file	"foo.cc"
	.version	"01.01"
# GNU C++ version 2.95.3 20010315 (release) (i686-pc-linux-gnu) compiled by GNU C version 2.95.3 20010315 (release).
# options passed:  -O3 -fverbose-asm
# options enabled:  -fdefer-pop -fcse-follow-jumps -fcse-skip-blocks
# -fexpensive-optimizations -fthread-jumps -fstrength-reduce -fpeephole
# -fforce-mem -ffunction-cse -finline-functions -finline
# -fkeep-static-consts -fcaller-saves -fpcc-struct-return -fgcse
# -frerun-cse-after-loop -frerun-loop-opt -fschedule-insns2 -fexceptions
# -fcommon -fverbose-asm -fgnu-linker -fregmove -foptimize-register-move
# -fargument-alias -fident -m80387 -mhard-float -mno-soft-float -mieee-fp
# -mfp-ret-in-387 -mschedule-prologue -mcpu=pentiumpro -march=pentium

gcc2_compiled.:
.globl counter
.data
	.align 4
	.type	 counter,@object
	.size	 counter,4
counter:
	.long 0
.text
	.align 4
.globl foo__Fb
	.type	 foo__Fb,@function
foo__Fb:
.LFB1:
	pushl %ebp
.LCFI0:
	movl %esp,%ebp
.LCFI1:
	movl $program,%eax
	movl %eax,%ecx
	cmpb $0,8(%ebp)
	je .L3
	movl $998,%ecx
	movl $program+3992,%eax
	.p2align 4,,7
.L7:
	movl $.L8,(%eax)
	addl $-4,%eax
	decl %ecx
	jns .L7
	movl $.L10,program+3996
	jmp .L2
	.p2align 4,,7
.L3:
	movl $999999,%edx
	jmp *program
	.p2align 4,,7
.L8:
	incl counter
	addl $4,%eax
	jmp *(%eax)
	.p2align 4,,7
.L10:
	testl %edx,%edx
	je .L2
	movl %ecx,%eax
	decl %edx
	jmp *program
	.p2align 4,,7
.L2:
	movl %ebp,%esp
	popl %ebp
	ret
.LFE1:
.Lfe1:
	.size	 foo__Fb,.Lfe1-foo__Fb
	.align 4
.globl main
	.type	 main,@function
main:
.LFB2:
	pushl %ebp
.LCFI2:
	movl %esp,%ebp
.LCFI3:
	subl $8,%esp
.LCFI4:
	addl $-12,%esp
	addl $-2,%esp
	pushw $1
.LCFI5:
	call foo__Fb
	addl $-12,%esp
	addl $-2,%esp
	pushw $0
	call foo__Fb
	xorl %eax,%eax
	movl %ebp,%esp
	popl %ebp
	ret
.LFE2:
.Lfe2:
	.size	 main,.Lfe2-main
.globl program
.bss
	.align 32
	.type	 program,@object
	.size	 program,4000
program:
	.zero	4000

.section	.eh_frame,"aw",@progbits
__FRAME_BEGIN__:
	.4byte	.LLCIE1
.LSCIE1:
	.4byte	0x0
	.byte	0x1
	.byte	0x0
	.byte	0x1
	.byte	0x7c
	.byte	0x8
	.byte	0xc
	.byte	0x4
	.byte	0x4
	.byte	0x88
	.byte	0x1
	.align 4
.LECIE1:
	.set	.LLCIE1,.LECIE1-.LSCIE1
	.4byte	.LLFDE1
.LSFDE1:
	.4byte	.LSFDE1-__FRAME_BEGIN__
	.4byte	.LFB1
	.4byte	.LFE1-.LFB1
	.byte	0x4
	.4byte	.LCFI0-.LFB1
	.byte	0xe
	.byte	0x8
	.byte	0x85
	.byte	0x2
	.byte	0x4
	.4byte	.LCFI1-.LCFI0
	.byte	0xd
	.byte	0x5
	.align 4
.LEFDE1:
	.set	.LLFDE1,.LEFDE1-.LSFDE1
	.4byte	.LLFDE3
.LSFDE3:
	.4byte	.LSFDE3-__FRAME_BEGIN__
	.4byte	.LFB2
	.4byte	.LFE2-.LFB2
	.byte	0x4
	.4byte	.LCFI2-.LFB2
	.byte	0xe
	.byte	0x8
	.byte	0x85
	.byte	0x2
	.byte	0x4
	.4byte	.LCFI3-.LCFI2
	.byte	0xd
	.byte	0x5
	.byte	0x4
	.4byte	.LCFI5-.LCFI3
	.byte	0x2e
	.byte	0x10
	.align 4
.LEFDE3:
	.set	.LLFDE3,.LEFDE3-.LSFDE3
	.ident	"GCC: (GNU) 2.95.3 20010315 (release)"
 ----- cut -----


> --=-=-=
> 
> Kostja,
> 
> could you perform some timings for the program attached below when
> compiled with gcc 2.95 and with 3.3.2?  Following a query on the gcc
> list, I am attempting to measure whether the differences in compilation
> have any measurable effect.
> 
> 
> --=-=-=
> Content-Disposition: attachment; filename=foo.cc
> Content-Description: jump stuff
> 
> void* program[1000];
> int counter = 0;
> 
> static const unsigned int AGAIN = 1000000;
> 
> #define DISPATCH(N) goto ** (PC+=N)
> 
> void foo (bool init)
> {
>   if (init) {
>     for (int i=0;i<999;i++) program[i] = && DECR;
>     program[999] = && HALT;
>     return;
>   } else {
>     void ** PC = program;
>     unsigned int again = AGAIN;
>     DISPATCH(0);
>   DECR:
>     counter += 1;
>     DISPATCH(1);
>   HALT:
>     {
>       again -= 1;
>       if (again)
> 	{
> 	  PC=program;
> 	  DISPATCH(0);
> 	}
>       return;
>     }
>   }
> }
> 
> int main()
> {
>   foo(true);
>   foo(false);
> }
> 
> --=-=-=
> Content-Type: text/plain; charset=iso-8859-1
> Content-Transfer-Encoding: 8bit
> 
> 
> -- 
> Denys Duchier - Équipe Calligramme - LORIA, Nancy, France
> 
> --=-=-=--
> -
> Please send submissions to hackers at mozart-oz.org
> and administrivia mail to hackers-request at mozart-oz.org.
> The Mozart Oz web site is at http://www.mozart-oz.org/.

Cheers,

 --- Kostja.



More information about the mozart-hackers mailing list