(out-of-band) performance with gcc3
Konstantin Popov
kost at sics.se
Thu Feb 12 12:51:17 CET 2004
Denys,
thanks for the reminder - I apparently confused this test with
another "foo.cc" I answered before.
First, I had to modify the source as follows because it did not
compile with gcc 2.95.3:
----- cut -----
// Dispatch table: each slot holds the address of a label inside foo().
// foo(true) fills it; foo(false) jumps through it.
void* program[1000];
// Incremented once each time the DECR label in foo() is executed.
int counter = 0;
// Initial value of foo()'s `again` countdown, i.e. how many times the
// whole table is traversed before foo(false) returns.
static const unsigned int AGAIN = 1000000;
// Advance PC by N slots, then jump to the label address stored in the slot
// PC now points at. Relies on a variable named PC being in scope at the
// point of use and on the GNU "labels as values" (computed goto) extension.
#define DISPATCH(N) goto ** (PC+=N)
// Test driver for computed-goto dispatch.
//
// init == true : store the address of DECR in program[0..998] and the
//                address of HALT in program[999], then return.
// init == false: jump through program[] starting at slot 0. Each DECR entry
//                bumps `counter` and dispatches to the next slot; the HALT
//                entry counts down `again` and restarts from slot 0 until
//                the countdown reaches zero.
void foo (bool init)
{
// PC and again are declared ahead of the if/else here; the quoted original
// further down in this message declares them inside the else branch. That
// move is the modification made so the file compiles with gcc 2.95.3.
void ** PC = program;
unsigned int again = AGAIN;
if (init) {
// && LABEL is the GNU unary operator yielding a label's address as void*.
for (int i=0;i<999;i++) program[i] = && DECR;
program[999] = && HALT;
return;
} else {
// Enter the table at slot 0 (PC is not advanced).
DISPATCH(0);
DECR:
counter += 1;
// Step to the next slot and jump to whatever label it holds.
DISPATCH(1);
HALT:
{
again -= 1;
if (again)
{
// Countdown not finished: rewind to the start of the table and go again.
PC=program;
DISPATCH(0);
}
return;
}
}
}
// Calls foo twice: first with init == true to fill the dispatch table,
// then with init == false to run through it.
int main()
{
foo(true);
foo(false);
}
----- cut -----
which did not make any difference with gcc 3.3.2:
----- cut -----
kost (7) gcc --version
gcc (GCC) 3.3.2
Copyright (C) 2003 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
kost (13) gcc -O3 foo.cc
kost (14) time ./a.out
3.064u 0.023s 0:03.15 97.7% 0+0k 0+0io 75pf+0w
----- cut -----
The binary built with gcc 2.95.3 actually runs slower (which does not make it very
pedagogical :-)):
----- cut -----
kost (19) gcc --version
2.95.3
kost (20) gcc -O3 foo.cc
kost (21) time ./a.out
5.382u 0.056s 0:05.57 97.4% 0+0k 0+0io 75pf+0w
----- cut -----
but for different reasons if you check the .s files.
The DISPATCH is still compiled better with old gcc.
----- cut -----
kost (22) gcc -O3 -S -fverbose-asm foo.cc
----- cut -----
.file "foo.cc"
# GNU C++ version 3.3.2 (i686-pc-linux-gnu)
# compiled by GNU C version 3.3.2.
# GGC heuristics: --param ggc-min-expand=64 --param ggc-min-heapsize=64260
# options passed: -D__GNUC__=3 -D__GNUC_MINOR__=3 -D__GNUC_PATCHLEVEL__=2
# -D_GNU_SOURCE -D__GNUG__=3 -auxbase -O3 -fverbose-asm
# options enabled: -fdefer-pop -foptimize-sibling-calls -fcse-follow-jumps
# -fcse-skip-blocks -fexpensive-optimizations -fthread-jumps
# -fstrength-reduce -fpeephole -fforce-mem -ffunction-cse
# -fkeep-static-consts -fcaller-saves -fpcc-struct-return -fgcse -fgcse-lm
# -fgcse-sm -floop-optimize -fcrossjumping -fif-conversion -fif-conversion2
# -frerun-cse-after-loop -frerun-loop-opt -fdelete-null-pointer-checks
# -fschedule-insns2 -fsched-interblock -fsched-spec -fbranch-count-reg
# -fexceptions -freorder-blocks -freorder-functions -frename-registers
# -fcprop-registers -fcommon -fverbose-asm -fgnu-linker -fregmove
# -foptimize-register-move -fargument-alias -fstrict-aliasing
# -fmerge-constants -fzero-initialized-in-bss -fident -fpeephole2
# -fguess-branch-probability -fmath-errno -ftrapping-math -m80387
# -mhard-float -mno-soft-float -mieee-fp -mfp-ret-in-387
# -maccumulate-outgoing-args -mcpu=pentiumpro -march=i386
.globl program
.bss
.align 32
.type program, @object
.size program, 4000
program:
.zero 4000
.globl counter
.align 4
.type counter, @object
.size counter, 4
counter:
.zero 4
.text
.align 2
.p2align 4,,15
.globl _Z3foob
.type _Z3foob, @function
_Z3foob:
.LFB4:
pushl %ebp
.LCFI0:
movl $program, %edx # PC
movl %esp, %ebp
.LCFI1:
cmpb $0, 8(%ebp) # init
movl $1000000, %ecx # again
pushl %ebx
.LCFI2:
je .L2
xorl %eax, %eax
.p2align 4,,15
.L8:
movl $.L7, %ecx
movl %ecx, program(,%eax,4) # program
incl %eax # i
cmpl $998, %eax # i
jle .L8
movl $.L9, %edx
movl %edx, program+3996 # program
.L1:
popl %ebx
popl %ebp
ret
.L2:
movl program, %eax
movl %eax, %ebx
jmp *%eax
.L7:
incl counter # counter
addl $4, %edx # PC
movl (%edx), %eax # * PC
jmp *%eax
.L9:
decl %ecx # again
je .L1
movl $program, %edx # PC
movl %ebx, %eax
jmp *%eax
.LFE4:
.size _Z3foob, .-_Z3foob
.align 2
.p2align 4,,15
.globl main
.type main, @function
main:
.LFB5:
pushl %ebp
.LCFI3:
movl %esp, %ebp
.LCFI4:
subl $8, %esp
.LCFI5:
andl $-16, %esp
movl $1, (%esp)
call _Z3foob
movl $0, (%esp)
call _Z3foob
movl %ebp, %esp
xorl %eax, %eax
popl %ebp
ret
.LFE5:
.size main, .-main
.ident "GCC: (GNU) 3.3.2"
----- cut -----
----- cut -----
gcc -O3 -S -fverbose-asm foo.cc
----- cut -----
.file "foo.cc"
.version "01.01"
# GNU C++ version 2.95.3 20010315 (release) (i686-pc-linux-gnu) compiled by GNU C version 2.95.3 20010315 (release).
# options passed: -O3 -fverbose-asm
# options enabled: -fdefer-pop -fcse-follow-jumps -fcse-skip-blocks
# -fexpensive-optimizations -fthread-jumps -fstrength-reduce -fpeephole
# -fforce-mem -ffunction-cse -finline-functions -finline
# -fkeep-static-consts -fcaller-saves -fpcc-struct-return -fgcse
# -frerun-cse-after-loop -frerun-loop-opt -fschedule-insns2 -fexceptions
# -fcommon -fverbose-asm -fgnu-linker -fregmove -foptimize-register-move
# -fargument-alias -fident -m80387 -mhard-float -mno-soft-float -mieee-fp
# -mfp-ret-in-387 -mschedule-prologue -mcpu=pentiumpro -march=pentium
gcc2_compiled.:
.globl counter
.data
.align 4
.type counter,@object
.size counter,4
counter:
.long 0
.text
.align 4
.globl foo__Fb
.type foo__Fb,@function
foo__Fb:
.LFB1:
pushl %ebp
.LCFI0:
movl %esp,%ebp
.LCFI1:
movl $program,%eax
movl %eax,%ecx
cmpb $0,8(%ebp)
je .L3
movl $998,%ecx
movl $program+3992,%eax
.p2align 4,,7
.L7:
movl $.L8,(%eax)
addl $-4,%eax
decl %ecx
jns .L7
movl $.L10,program+3996
jmp .L2
.p2align 4,,7
.L3:
movl $999999,%edx
jmp *program
.p2align 4,,7
.L8:
incl counter
addl $4,%eax
jmp *(%eax)
.p2align 4,,7
.L10:
testl %edx,%edx
je .L2
movl %ecx,%eax
decl %edx
jmp *program
.p2align 4,,7
.L2:
movl %ebp,%esp
popl %ebp
ret
.LFE1:
.Lfe1:
.size foo__Fb,.Lfe1-foo__Fb
.align 4
.globl main
.type main,@function
main:
.LFB2:
pushl %ebp
.LCFI2:
movl %esp,%ebp
.LCFI3:
subl $8,%esp
.LCFI4:
addl $-12,%esp
addl $-2,%esp
pushw $1
.LCFI5:
call foo__Fb
addl $-12,%esp
addl $-2,%esp
pushw $0
call foo__Fb
xorl %eax,%eax
movl %ebp,%esp
popl %ebp
ret
.LFE2:
.Lfe2:
.size main,.Lfe2-main
.globl program
.bss
.align 32
.type program,@object
.size program,4000
program:
.zero 4000
.section .eh_frame,"aw",@progbits
__FRAME_BEGIN__:
.4byte .LLCIE1
.LSCIE1:
.4byte 0x0
.byte 0x1
.byte 0x0
.byte 0x1
.byte 0x7c
.byte 0x8
.byte 0xc
.byte 0x4
.byte 0x4
.byte 0x88
.byte 0x1
.align 4
.LECIE1:
.set .LLCIE1,.LECIE1-.LSCIE1
.4byte .LLFDE1
.LSFDE1:
.4byte .LSFDE1-__FRAME_BEGIN__
.4byte .LFB1
.4byte .LFE1-.LFB1
.byte 0x4
.4byte .LCFI0-.LFB1
.byte 0xe
.byte 0x8
.byte 0x85
.byte 0x2
.byte 0x4
.4byte .LCFI1-.LCFI0
.byte 0xd
.byte 0x5
.align 4
.LEFDE1:
.set .LLFDE1,.LEFDE1-.LSFDE1
.4byte .LLFDE3
.LSFDE3:
.4byte .LSFDE3-__FRAME_BEGIN__
.4byte .LFB2
.4byte .LFE2-.LFB2
.byte 0x4
.4byte .LCFI2-.LFB2
.byte 0xe
.byte 0x8
.byte 0x85
.byte 0x2
.byte 0x4
.4byte .LCFI3-.LCFI2
.byte 0xd
.byte 0x5
.byte 0x4
.4byte .LCFI5-.LCFI3
.byte 0x2e
.byte 0x10
.align 4
.LEFDE3:
.set .LLFDE3,.LEFDE3-.LSFDE3
.ident "GCC: (GNU) 2.95.3 20010315 (release)"
----- cut -----
> --=-=-=
>
> Kostja,
>
> could you perform some timings for the program attached below when
> compiled with gcc 2.95 and with 3.3.2? Following a query on the gcc
> list, I am attempting to measure whether the difference in compilation
> has any measurable effect.
>
>
> --=-=-=
> Content-Disposition: attachment; filename=foo.cc
> Content-Description: jump stuff
>
> void* program[1000];
> int counter = 0;
>
> static const unsigned int AGAIN = 1000000;
>
> #define DISPATCH(N) goto ** (PC+=N)
>
> void foo (bool init)
> {
> if (init) {
> for (int i=0;i<999;i++) program[i] = && DECR;
> program[999] = && HALT;
> return;
> } else {
> void ** PC = program;
> unsigned int again = AGAIN;
> DISPATCH(0);
> DECR:
> counter += 1;
> DISPATCH(1);
> HALT:
> {
> again -= 1;
> if (again)
> {
> PC=program;
> DISPATCH(0);
> }
> return;
> }
> }
> }
>
> int main()
> {
> foo(true);
> foo(false);
> }
>
> --=-=-=
> Content-Type: text/plain; charset=iso-8859-1
> Content-Transfer-Encoding: 8bit
>
>
> --
> Denys Duchier - Équipe Calligramme - LORIA, Nancy, France
>
> --=-=-=--
> -
> Please send submissions to hackers at mozart-oz.org
> and administriva mail to hackers-request at mozart-oz.org.
> The Mozart Oz web site is at http://www.mozart-oz.org/.
Cheers,
--- Kostja.
More information about the mozart-hackers
mailing list