Re: RISC again
>>>>> "Penio" == Penio Penev <penev@pisa.rockefeller.edu> writes:
Penio> On Mon, 25 Mar 1996, Andrew Haley wrote:
>> RISC instructions are often a bit less dense than CISC. RISC
>> processors handle procedures with a variable number of
>> arguments perfectly correctly.
Penio> I tried to elaborate on this :-)
>> That is: the code does the right thing, and it does it quickly.
>> Compiler writing for RISC architectures is to a large extent a
>> problem which has already been solved. I use such a compiler
>> every day: the code it generates isn't perfectly optimum, but
>> it is correct. (In fact that's not quite true; I have
>> experienced one optimizer bug which generated incorrect code in
>> the course of three years' programming.)
Penio> Actually, at the things that matter most -- loop unrolling
Penio> for floating point routines -- they are very bad (well, the
Penio> SGI MIPS compiler for IRIX 5.3 is). The compiler, given the
Penio> following C code, produces either far from optimal machine
Penio> code, or _incorrect_ code, depending on the switches you
Penio> feed it:
Penio> /* * Calculate the dot product of two samples */
Penio> double ddot(double *v1, double *v2, int n){ int i; double
Penio> acc;
Penio> for(acc=0,i=0; i<n; i++) acc+=v1[i]*v2[i]; return(acc); }
Penio> This is not a statement about RISC per se; it only says
Penio> that the problem of RISC C compiler writing is far from
Penio> being solved.
Penio> -- Penio Penev <Penev@pisa.Rockefeller.edu> 1-212-327-7423
I happen to have a GNU MIPS compiler handy, so I tried it on your
example program. Here are the results. Just how good or bad are
they? What improvements could be made in terms of speed? I am not
too concerned about memory usage.
eli:/usr/rj$ mips-ecoff-gcc -g -O2 -Wa,-ahldn -c ddot.c
1 .file 1 "ddot.c"
2
3 # GNU C 2.7.2 [AL 1.1, MM 40] BSD Mips compiled by GNU C
4
5 # Cc1 defaults:
6
7 # Cc1 arguments (-G value = 8, Cpu = 3000, ISA = 1):
8 # -quiet -dumpbase -g -O2 -o
9
10 gcc2_compiled.:
11 __gnu_compiled_c:
14 .text
15 $Ltext0:
35 .align 2
36 .globl ddot
45
46 .text
47 .text
48 $LM1:
1:ddot.c **** /*
2:ddot.c **** * Calculate the dot product of two samples
3:ddot.c **** */
4:ddot.c ****
5:ddot.c **** double ddot(double *v1, double *v2, int n){
51 .ent ddot
52 ddot:
53 .frame $sp,8,$31 # vars= 8, regs= 0/0, args= 0, extra= 0
54 .mask 0x00000000,0
55 .fmask 0x00000000,0
56 $LBB2:
57 0000 27BDFFF8 subu $sp,$sp,8
58 $LM2:
6:ddot.c **** int i;
7:ddot.c **** double acc;
8:ddot.c ****
9:ddot.c **** for(acc=0,i=0; i<n; i++) acc+=v1[i]*v2[i];
60 0004 44802000 mtc1 $0,$f4
60 00000000
61 000c 44802800 mtc1 $0,$f5
62 0010 00000000 .set noreorder
63 .set nomacro
64 0014 18C00010 blez $6,$L3
65 0018 00001821 move $3,$0
66 .set macro
67 .set reorder
68
69 001c C4810000 $L5:
69 00000000
70 0024 C4800004 l.d $f0,0($4)
70 00000000
70 C4A30000
70 00000000
71 0034 C4A20004 l.d $f2,0($5)
72 0038 00000000 #nop
73 003c 46220002 mul.d $f0,$f0,$f2
74 0040 24A50008 addu $5,$5,8
75 0044 24840008 addu $4,$4,8
76 0048 24630001 addu $3,$3,1
77 004c 0066102A slt $2,$3,$6
78 .set noreorder
79 .set nomacro
80 0050 1440FFF2 bne $2,$0,$L5
81 0054 46202100 add.d $f4,$f4,$f0
82 .set macro
83 .set reorder
84
85 $L3:
86 $LM3:
10:ddot.c **** return(acc);
88 0058 46202006 mov.d $f0,$f4
89 $LM4:
11:ddot.c **** }
91 $LBE2:
92 005c 27BD0008 addu $sp,$sp,8
93 0060 03E00008 j $31
93 00000000
94 0068 00000000 .end ddot
94 00000000
eli:/usr/rj$ mips-ecoff-gcc -g -O3 -ffast-math -fstrength-reduce -fdelayed-branch -fschedule-insns2 -fexpensive-optimizations -funroll-loops -funroll-all-loops -Wa,-ahldn -c ddot.c
1 .file 1 "ddot.c"
2
3 # GNU C 2.7.2 [AL 1.1, MM 40] BSD Mips compiled by GNU C
4
5 # Cc1 defaults:
6
7 # Cc1 arguments (-G value = 8, Cpu = 3000, ISA = 1):
8 # -quiet -dumpbase -g -O3 -ffast-math -fstrength-reduce -fdelayed-branch
9 # -fschedule-insns2 -fexpensive-optimizations -funroll-loops
10 # -funroll-all-loops -o
11
12 gcc2_compiled.:
13 __gnu_compiled_c:
16 .text
17 $Ltext0:
37 .align 2
38 .globl ddot
45
46 .text
47 .text
48 $LM1:
1:ddot.c **** /*
2:ddot.c **** * Calculate the dot product of two samples
3:ddot.c **** */
4:ddot.c ****
5:ddot.c **** double ddot(double *v1, double *v2, int n){
51 .ent ddot
52 ddot:
53 .frame $sp,8,$31 # vars= 8, regs= 0/0, args= 0, extra= 0
54 .mask 0x00000000,0
55 .fmask 0x00000000,0
56 $LBB2:
57 0000 27BDFFF8 subu $sp,$sp,8
58 $LM2:
6:ddot.c **** int i;
7:ddot.c **** double acc;
8:ddot.c ****
9:ddot.c **** for(acc=0,i=0; i<n; i++) acc+=v1[i]*v2[i];
60 0004 44805000 mtc1 $0,$f10
60 00000000
61 000c 44805800 mtc1 $0,$f11
62 0010 00000000 .set noreorder
63 .set nomacro
64 0014 18C0005D blez $6,$L9
65 0018 00003821 move $7,$0
66 .set macro
67 .set reorder
68
69 001c 30C30003 andi $3,$6,0x0003
70 .set noreorder
71 .set nomacro
72 0020 1060002D beq $3,$0,$L11
73 0024 28620002 slt $2,$3,2
74 .set macro
75 .set reorder
76
77 .set noreorder
78 .set nomacro
79 0028 1440001C bne $2,$0,$L15
80 002c 28620003 slt $2,$3,3
81 .set macro
82 .set reorder
83
84 0030 1440000D bne $2,$0,$L16
84 00000000
84 C4A30000
84 00000000
85 0040 C4A20004 l.d $f2,0($5)
85 00000000
86 0048 24A50008 addu $5,$5,8
86 C4810000
86 00000000
87 0054 C4800004 l.d $f0,0($4)
87 00000000
88 005c 24840008 addu $4,$4,8
89 0060 24070001 li $7,0x00000001 # 1
90 0064 46220282 mul.d $f10,$f0,$f2
91 0068 C4830000 $L16:
91 00000000
92 0070 C4820004 l.d $f2,0($4)
92 00000000
92 C4A10000
92 00000000
93 0080 C4A00004 l.d $f0,0($5)
94 0084 00000000 #nop
95 0088 46201082 mul.d $f2,$f2,$f0
96 008c 24A50008 addu $5,$5,8
97 0090 24840008 addu $4,$4,8
98 0094 24E70001 addu $7,$7,1
99 0098 46225280 add.d $f10,$f10,$f2
100 009c C4830000 $L15:
100 00000000
101 00a4 C4820004 l.d $f2,0($4)
101 00000000
101 C4A10000
101 00000000
102 00b4 C4A00004 l.d $f0,0($5)
103 00b8 00000000 #nop
104 00bc 46201082 mul.d $f2,$f2,$f0
105 00c0 24A50008 addu $5,$5,8
106 00c4 24840008 addu $4,$4,8
107 00c8 24E70001 addu $7,$7,1
108 00cc 00E6102A slt $2,$7,$6
109 .set noreorder
110 .set nomacro
111 00d0 1040002E beq $2,$0,$L9
112 00d4 46225280 add.d $f10,$f10,$f2
113 .set macro
114 .set reorder
115
116 00d8 C4870000 $L11:
116 00000000
117 00e0 C4860004 l.d $f6,0($4)
117 00000000
117 C4A10000
117 00000000
118 00f0 C4A00004 l.d $f0,0($5)
119 00f4 00000000 #nop
120 00f8 46203182 mul.d $f6,$f6,$f0
120 C4850008
120 00000000
121 0104 C484000C l.d $f4,8($4)
121 00000000
121 C4A10008
121 00000000
122 0114 C4A0000C l.d $f0,8($5)
123 0118 00000000 #nop
124 011c 46202102 mul.d $f4,$f4,$f0
124 C4830010
124 00000000
125 0128 C4820014 l.d $f2,16($4)
125 00000000
125 C4A10010
125 00000000
126 0138 C4A00014 l.d $f0,16($5)
126 00000000
127 0140 24E70004 addu $7,$7,4
128 0144 46201082 mul.d $f2,$f2,$f0
129 0148 00E6102A slt $2,$7,$6
129 C4A90018
129 00000000
130 0154 C4A8001C l.d $f8,24($5)
130 00000000
130 C4810018
130 00000000
131 0164 C480001C l.d $f0,24($4)
131 00000000
132 016c 46265280 add.d $f10,$f10,$f6
133 0170 46280002 mul.d $f0,$f0,$f8
134 0174 46245280 add.d $f10,$f10,$f4
135 0178 24A50020 addu $5,$5,32
136 017c 46225280 add.d $f10,$f10,$f2
137 0180 24840020 addu $4,$4,32
138 .set noreorder
139 .set nomacro
140 0184 1440FFD4 bne $2,$0,$L11
141 0188 46205280 add.d $f10,$f10,$f0
142 .set macro
143 .set reorder
144
145 $L9:
146 $LM3:
10:ddot.c **** return(acc);
148 018c 46205006 mov.d $f0,$f10
149 $LM4:
11:ddot.c **** }
151 $LBE2:
152 0190 27BD0008 addu $sp,$sp,8
153 0194 03E00008 j $31
153 00000000
154 019c 00000000 .end ddot