Attachment 'm4ri_trsm_UL_LR.patch'
Download 1 # HG changeset patch
2 # User Clement Pernet <[email protected]>
3 # Date 1223885528 -7200
4 # Node ID cafd6739ffedec5f329365aa8d5a9ffb758a55ab
5 # Parent 7d525bf8b16f4c51f6e45e0b09dda5cdf3eae34c
6 Added the 2 remaining trsm and the corresponding testsuite and benchmarks.
7
8 diff -r 7d525bf8b16f -r cafd6739ffed src/trsm.c
9 --- a/src/trsm.c Sat Oct 11 10:16:18 2008 +0200
10 +++ b/src/trsm.c Mon Oct 13 10:12:08 2008 +0200
11 @@ -24,6 +24,7 @@
12 #include "parity.h"
13 #include "stdio.h"
14
15 +/* Upper right case */
16 void mzd_trsm_upper_right (packedmatrix *U, packedmatrix *B, const int cutoff) {
17 if(U->nrows != B->ncols)
18 m4ri_die("mzd_trsm_upper_right: U nrows (%d) need to match B ncols (%d).\n", U->nrows, B->ncols);
19 @@ -130,52 +131,75 @@ void _mzd_trsm_upper_right_weird (packed
20 }
21 }
22
23 +void _mzd_trsm_upper_right_base (packedmatrix* U, packedmatrix* B, const int cutoff){
24 +
25 + size_t mb = B->nrows;
26 + size_t nb = B->ncols;
27 +
28 + for (size_t i=1; i < nb; ++i) {
29 + /* Computes X_i = B_i + X_{0..i-1} U_{0..i-1,i} */
30 + register word ucol = 0;
31 + for (size_t k=0; k<i; ++k) {
32 + if (GET_BIT (U->values[U->rowswap[k]], i))
33 + SET_BIT (ucol, k);
34 + }
35 +
36 + /* doing 64 dotproducts at a time, to use the parity64 parallelism */
37 + size_t giantstep;
38 + word tmp[64];
39 + for (giantstep = 0; giantstep + RADIX < mb; giantstep += RADIX) {
40 + for (size_t babystep = 0; babystep < RADIX; ++babystep)
41 + tmp [babystep] = B->values [B->rowswap [babystep + giantstep]] & ucol;
42 +
43 + word dotprod = parity64 (tmp);
44 +
45 + for (size_t babystep = 0; babystep < RADIX; ++babystep)
46 + if (GET_BIT (dotprod, babystep))
47 + FLIP_BIT (B->values [B->rowswap [giantstep + babystep]], i);
48 + }
49 +
50 + for (size_t babystep = 0; babystep < mb - giantstep; ++babystep)
51 + tmp [babystep] = B->values [B->rowswap [babystep + giantstep]] & ucol;
52 + for (size_t babystep = mb-giantstep; babystep < 64; ++babystep)
53 + tmp [babystep] = 0;
54 +
55 + word dotprod = parity64 (tmp);
56 + for (size_t babystep = 0; babystep < mb - giantstep; ++babystep)
57 + if (GET_BIT (dotprod, babystep))
58 + FLIP_BIT (B->values [B->rowswap [giantstep + babystep ]], i);
59 + }
60 +}
61 +/* void _mzd_trsm_upper_right_base (packedmatrix* U, packedmatrix* B, const int cutoff){ */
62 +/* size_t mb = B->nrows; */
63 +/* size_t nb = B->ncols; */
64 +/* for (size_t i=1; i < nb; ++i) { */
65 +/* /\* Computes X_i = B_i + X_{0..i-1} U_{0..i-1,i} *\/ */
66 +/* packedmatrix *Xm = mzd_init_window(B, 0, 0, mb, i); */
67 +/* packedmatrix *Bm = mzd_init_window(B, 0, i, mb, i+1); */
68 +/* packedmatrix *Um = mzd_init_window(U, 0, i, i, i+1); */
69 +/* _mzd_addmul(Bm, Xm, Um, cutoff); */
70 +/* mzd_free_window(Xm); */
71 +/* mzd_free_window(Um); */
72 +/* mzd_free_window(Bm); */
73 +/* } */
74 +/* } */
75 +
76 /**
77 * Variant where U and B start at an even bit position
78 * Assumes that U->ncols < 64
79 */
80 +#define TRSM_THRESHOLD 64
81 +
82 void _mzd_trsm_upper_right_even (packedmatrix *U, packedmatrix *B, const int cutoff) {
83
84 size_t mb = B->nrows;
85 size_t nb = B->ncols;
86 -
87 - if (nb <= RADIX){
88 +
89 + if (nb <= TRSM_THRESHOLD){
90 /* base case */
91 - for (size_t i=1; i < nb; ++i) {
92 -
93 - /* Computes X_i = B_i + X_{0..i-1} U_{0..i-1,i} */
94 -
95 - register word ucol = 0;
96 - for (size_t k=0; k<i; ++k) {
97 - if (GET_BIT (U->values[U->rowswap[k]], i))
98 - SET_BIT (ucol, k);
99 - }
100 -
101 - /* doing 64 dotproducts at a time, to use the parity64 parallelism */
102 - size_t giantstep;
103 - word tmp[64];
104 - for (giantstep = 0; giantstep + RADIX < mb; giantstep += RADIX) {
105 - for (size_t babystep = 0; babystep < RADIX; ++babystep)
106 - tmp [babystep] = B->values [B->rowswap [babystep + giantstep]] & ucol;
107 -
108 - word dotprod = parity64 (tmp);
109 -
110 - for (size_t babystep = 0; babystep < RADIX; ++babystep)
111 - if (GET_BIT (dotprod, babystep))
112 - FLIP_BIT (B->values [B->rowswap [giantstep + babystep]], i);
113 - }
114 -
115 - for (size_t babystep = 0; babystep < mb - giantstep; ++babystep)
116 - tmp [babystep] = B->values [B->rowswap [babystep + giantstep]] & ucol;
117 - for (size_t babystep = mb-giantstep; babystep < 64; ++babystep)
118 - tmp [babystep] = 0;
119 -
120 - word dotprod = parity64 (tmp);
121 - for (size_t babystep = 0; babystep < mb - giantstep; ++babystep)
122 - if (GET_BIT (dotprod, babystep))
123 - FLIP_BIT (B->values [B->rowswap [giantstep + babystep ]], i);
124 - }
125 - } else {
126 + _mzd_trsm_upper_right_base (U, B, cutoff);
127 + }
128 + else {
129 size_t nb1 = (((nb-1) / RADIX + 1) >> 1) * RADIX;
130
131 packedmatrix *B0 = mzd_init_window(B, 0, 0, mb, nb1);
132 @@ -196,6 +220,208 @@ void _mzd_trsm_upper_right_even (packedm
133 mzd_free_window(U11);
134 }
135 }
136 +
137 +/* Lower right case */
138 +void mzd_trsm_lower_right (packedmatrix *L, packedmatrix *B, const int cutoff) {
139 + if(L->nrows != B->ncols)
140 + m4ri_die("mzd_trsm_lower_right: L nrows (%d) need to match B ncols (%d).\n", L->nrows, B->ncols);
141 + if(L->nrows != L->ncols)
142 + m4ri_die("mzd_trsm_lower_right: L must be square and is found to be (%d) x (%d).\n", L->nrows, L->ncols);
143 +
144 + if (cutoff <= 0)
145 + m4ri_die("mzd_trsm_lower_right: cutoff must be > 0.\n");
146 +
147 + _mzd_trsm_lower_right (L, B, cutoff);
148 +}
149 +
150 +void _mzd_trsm_lower_right (packedmatrix *L, packedmatrix *B, const int cutoff) {
151 + size_t nb = B->ncols;
152 + size_t mb = B->nrows;
153 + size_t n1 = RADIX-B->offset;
154 + if (nb <= n1)
155 + _mzd_trsm_lower_right_weird (L, B, cutoff);
156 + else{
157 + /**
158 + \verbatim
159 +
160 + |\
161 + | \
162 + | \
163 + |L00\
164 + |____\
165 + | |\
166 + | | \
167 + | | \
168 + |L10 |L11\
169 + |____|____\
170 + _________
171 + |B0 |B1 |
172 + |____|____|
173 + \endverbatim
174 +
175 + * L00 and B0 are possibly located at uneven locations.
176 + * Their column dimension is lower than 64
177 + * The first column of L10, L11, B1 are aligned to words.
178 + */
179 + packedmatrix *B0 = mzd_init_window (B, 0, 0, mb, n1);
180 + packedmatrix *B1 = mzd_init_window (B, 0, n1, mb, nb);
181 + packedmatrix *L00 = mzd_init_window (L, 0, 0, n1, n1);
182 + packedmatrix *L10 = mzd_init_window (L, n1, 0, nb, n1);
183 + packedmatrix *L11 = mzd_init_window (L, n1, n1, nb, nb);
184 +
185 + _mzd_trsm_lower_right_even (L11, B1, cutoff);
186 + mzd_addmul (B0, B1, L10, cutoff);
187 + _mzd_trsm_lower_right_weird (L00, B0, cutoff);
188 +
189 + mzd_free_window(B0);
190 + mzd_free_window(B1);
191 +
192 + mzd_free_window(L00);
193 + mzd_free_window(L10);
194 + mzd_free_window(L11);
195 + }
196 +}
197 +
198 +/**
199 + * Variant where L and B start at an odd bit position
200 + * Assumes that L->ncols < 64
201 + */
202 +void _mzd_trsm_lower_right_weird (packedmatrix *L, packedmatrix *B, const int cutoff) {
203 +
204 + size_t mb = B->nrows;
205 + size_t nb = B->ncols;
206 + size_t offset = B->offset;
207 +
208 + for (int i=nb-1; i >=0; --i) {
209 +
210 + /* Computes X_i = B_i + X_{i+1,n} L_{i+1..n,i} */
211 +
212 + register word ucol = 0;
213 + for (size_t k=i+1; k<nb; ++k) {
214 + if (GET_BIT (L->values[L->rowswap[k]], i + L->offset))
215 + SET_BIT (ucol, k+offset);
216 + }
217 + /* doing 64 dotproducts at a time, to use the parity64 parallelism */
218 + size_t giantstep;
219 + word tmp[64];
220 + for (giantstep = 0; giantstep + RADIX < mb; giantstep += RADIX) {
221 + for (size_t babystep = 0; babystep < RADIX; ++babystep)
222 + tmp [babystep] = B->values [B->rowswap [babystep + giantstep]] & ucol;
223 +
224 + word dotprod = parity64 (tmp);
225 +
226 + for (size_t babystep = 0; babystep < RADIX; ++babystep)
227 + if (GET_BIT (dotprod, babystep))
228 + FLIP_BIT (B->values [B->rowswap [giantstep + babystep]], i + offset);
229 + }
230 + for (size_t babystep = 0; babystep < mb - giantstep; ++babystep){
231 + tmp [babystep] = B->values [B->rowswap [babystep + giantstep]] & ucol;
232 +
233 + }
234 + for (size_t babystep = mb-giantstep; babystep < 64; ++babystep){
235 + tmp [babystep] = 0;
236 + }
237 +
238 + word dotprod = parity64 (tmp);
239 +
240 + for (size_t babystep = 0; babystep < mb - giantstep; ++babystep)
241 + if (GET_BIT (dotprod, babystep))
242 + FLIP_BIT (B->values [B->rowswap [giantstep + babystep ]], i + offset);
243 + }
244 +}
245 +
246 +void _mzd_trsm_lower_right_base (packedmatrix* L, packedmatrix* B, const int cutoff){
247 +
248 + size_t mb = B->nrows;
249 + size_t nb = B->ncols;
250 +
251 + for (int i=nb-1; i >=0; --i) {
252 +
253 + /* Computes X_i = B_i + X_{i+1,n} L_{i+1..n,i} */
254 +
255 + register word ucol = 0;
256 + for (size_t k=i+1; k<nb; ++k) {
257 + if (GET_BIT (L->values[L->rowswap[k]], i))
258 + SET_BIT (ucol, k);
259 + }
260 +
261 + /* doing 64 dotproducts at a time, to use the parity64 parallelism */
262 + size_t giantstep;
263 + word tmp[64];
264 + for (giantstep = 0; giantstep + RADIX < mb; giantstep += RADIX) {
265 + for (size_t babystep = 0; babystep < RADIX; ++babystep)
266 + tmp [babystep] = B->values [B->rowswap [babystep + giantstep]] & ucol;
267 +
268 + word dotprod = parity64 (tmp);
269 +
270 + for (size_t babystep = 0; babystep < RADIX; ++babystep)
271 + if (GET_BIT (dotprod, babystep))
272 + FLIP_BIT (B->values [B->rowswap [giantstep + babystep]], i);
273 + }
274 +
275 + for (size_t babystep = 0; babystep < mb - giantstep; ++babystep)
276 + tmp [babystep] = B->values [B->rowswap [babystep + giantstep]] & ucol;
277 + for (size_t babystep = mb-giantstep; babystep < 64; ++babystep)
278 + tmp [babystep] = 0;
279 +
280 + word dotprod = parity64 (tmp);
281 + for (size_t babystep = 0; babystep < mb - giantstep; ++babystep)
282 + if (GET_BIT (dotprod, babystep))
283 + FLIP_BIT (B->values [B->rowswap [giantstep + babystep ]], i);
284 + }
285 +}
286 +/* void _mzd_trsm_lower_right_base (packedmatrix* L, packedmatrix* B, const int cutoff){ */
287 +/* size_t mb = B->nrows; */
288 +/* size_t nb = B->ncols; */
289 +/* for (size_t i=1; i < nb; ++i) { */
290 +/* /\* Computes X_i = B_i + X_{0..i-1} L_{0..i-1,i} *\/ */
291 +/* packedmatrix *Xm = mzd_init_window(B, 0, 0, mb, i); */
292 +/* packedmatrix *Bm = mzd_init_window(B, 0, i, mb, i+1); */
293 +/* packedmatrix *Lm = mzd_init_window(L, 0, i, i, i+1); */
294 +/* _mzd_addmul(Bm, Xm, Lm, cutoff); */
295 +/* mzd_free_window(Xm); */
296 +/* mzd_free_window(Lm); */
297 +/* mzd_free_window(Bm); */
298 +/* } */
299 +/* } */
300 +
301 +/**
302 + * Variant where L and B start at an even bit position
303 + * Assumes that L->ncols < 64
304 + */
305 +#define TRSM_THRESHOLD 64
306 +
307 +void _mzd_trsm_lower_right_even (packedmatrix *L, packedmatrix *B, const int cutoff) {
308 +
309 + size_t mb = B->nrows;
310 + size_t nb = B->ncols;
311 +
312 + if (nb <= TRSM_THRESHOLD){
313 + /* base case */
314 + _mzd_trsm_lower_right_base (L, B, cutoff);
315 + }
316 + else {
317 + size_t nb1 = (((nb-1) / RADIX + 1) >> 1) * RADIX;
318 +
319 + packedmatrix *B0 = mzd_init_window(B, 0, 0, mb, nb1);
320 + packedmatrix *B1 = mzd_init_window(B, 0, nb1, mb, nb);
321 + packedmatrix *L00 = mzd_init_window(L, 0, 0, nb1, nb1);
322 + packedmatrix *L10 = mzd_init_window(L, nb1, 0, nb, nb1);
323 + packedmatrix *L11 = mzd_init_window(L, nb1, nb1, nb, nb);
324 +
325 + _mzd_trsm_lower_right_even (L11, B1, cutoff);
326 + mzd_addmul (B0, B1, L10, cutoff);
327 + _mzd_trsm_lower_right_even (L00, B0, cutoff);
328 +
329 + mzd_free_window(B0);
330 + mzd_free_window(B1);
331 +
332 + mzd_free_window(L00);
333 + mzd_free_window(L10);
334 + mzd_free_window(L11);
335 + }
336 +}
337 +
338
339 /* Lower Left Implementations */
340
341 @@ -236,7 +462,7 @@ void _mzd_trsm_lower_left (packedmatrix
342 * |L10 |L11\ | |
343 * |____|____\ |______|
344 *
345 - * L00 L10 and B0 are possibly located at uneven locations.
346 + * L00 L10 B0 and B1 are possibly located at uneven locations.
347 * Their column dimension is lower than 64
348 * The first column of L01, L11, B1 are aligned to words.
349 */
350 @@ -279,6 +505,7 @@ void _mzd_trsm_lower_left_weird (packedm
351 mask_begin = ~mask_begin;
352 word mask_end = LEFT_BITMASK(nbrest);
353
354 + // L[0,0] = 1, so no work required for i=0
355 for (size_t i=1; i < mb; ++i) {
356
357 /* Computes X_i = B_i + L_{i,0..i-1} X_{0..i-1} */
358 @@ -390,3 +617,201 @@ void _mzd_trsm_lower_left_even (packedma
359 mzd_free_window(L11);
360 }
361 }
362 +
363 +
364 +
365 +/* Upper Left Implementations */
366 +
367 +void mzd_trsm_upper_left (packedmatrix *U, packedmatrix *B, const int cutoff) {
368 + if(U->ncols != B->nrows)
369 + m4ri_die("mzd_trsm_upper_left: U ncols (%d) need to match B nrows (%d).\n", U->ncols, B->nrows);
370 + if(U->nrows != U->ncols)
371 + m4ri_die("mzd_trsm_upper_left: U must be square and is found to be (%d) x (%d).\n", U->nrows, U->ncols);
372 +
373 + if (cutoff <= 0)
374 + m4ri_die("mzd_trsm_upper_left: cutoff must be > 0.\n");
375 +
376 + _mzd_trsm_upper_left (U, B, cutoff);
377 +}
378 +
379 +void _mzd_trsm_upper_left (packedmatrix *U, packedmatrix *B, const int cutoff) {
380 +
381 +
382 + if (!U->offset)
383 + _mzd_trsm_upper_left_even (U, B, cutoff);
384 + else{
385 + size_t nb = B->ncols;
386 + size_t mb = B->nrows;
387 + size_t m1 = RADIX - U->offset;
388 + if (mb <= m1)
389 + _mzd_trsm_upper_left_weird (U, B, cutoff);
390 + else{
391 + /**
392 + * __________ ______
393 + * \ U00| | | |
394 + * \ |U01 | | |
395 + * \ | | | B0 |
396 + * \ | | | |
397 + * \|____| |______|
398 + * \ | | |
399 + * \U11| | |
400 + * \ | | B1 |
401 + * \ | | |
402 + * \| |______|
403 + *
404 + * U00, B0 and B1 are possibly located at uneven locations.
405 + * Their column dimension is greater than 64
406 + * The first column of U01, U11, B0 and B1 are aligned to words.
407 + */
408 +
409 + packedmatrix *B0 = mzd_init_window (B, 0, 0, m1, nb);
410 + packedmatrix *B1 = mzd_init_window (B, m1, 0, mb, nb);
411 + packedmatrix *U00 = mzd_init_window (U, 0, 0, m1, m1);
412 + packedmatrix *U01 = mzd_init_window (U, 0, m1, m1, mb);
413 + packedmatrix *U11 = mzd_init_window (U, m1, m1, mb, mb);
414 +
415 + _mzd_trsm_upper_left_even (U11, B1, cutoff);
416 + mzd_addmul (B0, U01, B1, cutoff);
417 + _mzd_trsm_upper_left_weird (U00, B0, cutoff);
418 +
419 + mzd_free_window(B0);
420 + mzd_free_window(B1);
421 +
422 + mzd_free_window(U00);
423 + mzd_free_window(U01);
424 + mzd_free_window(U11);
425 + }
426 + }
427 +}
428 +
429 +/**
430 + * Variant where U and B start at an odd bit position
431 + * Assumes that U->ncols < 64
432 + */
433 +void _mzd_trsm_upper_left_weird (packedmatrix *U, packedmatrix *B, const int cutoff) {
434 +
435 + size_t mb = B->nrows;
436 + size_t nb = B->ncols;
437 + size_t Boffset = B->offset;
438 + size_t nbrest = (nb + Boffset) % RADIX;
439 + if (nb + Boffset >= RADIX) {
440 +
441 + // Large B
442 + word mask_begin = RIGHT_BITMASK(RADIX-B->offset);
443 + if (B->offset == 0)
444 + mask_begin = ~mask_begin;
445 + word mask_end = LEFT_BITMASK(nbrest);
446 +
447 + // U[mb-1,mb-1] = 1, so no work required for i=mb-1
448 + for (int i=mb-2; i >= 0; --i) {
449 +
450 + /* Computes X_i = B_i + U_{i,i+1..mb} X_{i+1..mb} */
451 + size_t Uidx = U->rowswap[i];
452 + size_t Bidx = B->rowswap[i];
453 +
454 + for (size_t k=i+1; k<mb; ++k) {
455 + if (GET_BIT (U->values [Uidx], k + U->offset)){
456 + B->values [Bidx] ^= B->values [B->rowswap [k]] & mask_begin;
457 + for (size_t j = 1; j < B->width-1; ++j)
458 + B->values [Bidx + j] ^= B->values [B->rowswap [k] + j];
459 + B->values [Bidx + B->width - 1] ^= B->values [B->rowswap [k] + B->width - 1] & mask_end;
460 + }
461 + }
462 + }
463 + } else { // Small B
464 +
465 + word mask = ((ONE << nb) - 1) ;
466 + mask <<= (RADIX-nb-B->offset);
467 +
468 + // U[mb-1,mb-1] = 1, so no work required for i=mb-1
469 + for (int i=mb-2; i >= 0; --i) {
470 + /* Computes X_i = B_i + U_{i,i+1..mb} X_{i+1..mb} */
471 + size_t Uidx = U->rowswap[i];
472 + size_t Bidx = B->rowswap[i];
473 +
474 + for (size_t k=i+1; k<mb; ++k) {
475 + if (GET_BIT (U->values [Uidx], k + U->offset)){
476 + B->values [Bidx] ^= B->values [B->rowswap [k]] & mask;
477 + }
478 + }
479 + }
480 + }
481 +}
482 +
483 +/**
484 + * Variant where U and B start at an odd bit position
485 + * Assumes that U->ncols < 64
486 + */
487 +void _mzd_trsm_upper_left_even (packedmatrix *U, packedmatrix *B, const int cutoff) {
488 +
489 + size_t mb = B->nrows;
490 + size_t nb = B->ncols;
491 + size_t Boffset = B->offset;
492 + size_t nbrest = (nb + Boffset) % RADIX;
493 +
494 + if (mb <= RADIX){
495 + /* base case */
496 +
497 + if (nb + B->offset >= RADIX) {
498 + // B is large
499 + word mask_begin = RIGHT_BITMASK(RADIX-B->offset);
500 + if (B->offset == 0)
501 + mask_begin = ~mask_begin;
502 + word mask_end = LEFT_BITMASK(nbrest);
503 +
504 + // U[mb-1,mb-1] = 1, so no work required for i=mb-1
505 + for (int i=mb-2; i >= 0; --i) {
506 +
507 + /* Computes X_i = B_i + U_{i,i+1..mb} X_{i+1..mb} */
508 + size_t Uidx = U->rowswap[i];
509 + size_t Bidx = B->rowswap[i];
510 +
511 + for (size_t k=i+1; k<mb; ++k) {
512 + if (GET_BIT (U->values [Uidx], k)){
513 + B->values [Bidx] ^= B->values [B->rowswap [k]] & mask_begin;
514 + for (size_t j = 1; j < B->width-1; ++j)
515 + B->values [Bidx + j] ^= B->values [B->rowswap [k] + j];
516 + B->values [Bidx + B->width - 1] ^= B->values [B->rowswap [k] + B->width - 1] & mask_end;
517 + }
518 + }
519 + }
520 + } else { // B is small
521 + word mask = ((ONE << nb) - 1) ;
522 + mask <<= (RADIX-nb-B->offset);
523 + // U[mb-1,mb-1] = 1, so no work required for i=mb-1
524 + for (int i=mb-2; i >= 0; --i) {
525 +
526 + /* Computes X_i = B_i + U_{i,i+1..mb} X_{i+1..mb} */
527 + size_t Uidx = U->rowswap [i];
528 + size_t Bidx = B->rowswap [i];
529 +
530 + for (size_t k=i+1; k<mb; ++k) {
531 + if (GET_BIT (U->values [Uidx], k)){
532 + B->values [Bidx] ^= B->values [B->rowswap[k]] & mask;
533 + }
534 + }
535 + }
536 + }
537 + } else {
538 + size_t mb1 = (((mb-1) / RADIX + 1) >> 1) * RADIX;
539 +
540 + packedmatrix *B0 = mzd_init_window(B, 0, 0, mb1, nb);
541 + packedmatrix *B1 = mzd_init_window(B, mb1, 0, mb, nb);
542 + packedmatrix *U00 = mzd_init_window(U, 0, 0, mb1, mb1);
543 + packedmatrix *U01 = mzd_init_window(U, 0, mb1, mb1, mb);
544 + packedmatrix *U11 = mzd_init_window(U, mb1, mb1, mb, mb);
545 +
546 + _mzd_trsm_upper_left_even (U11, B1, cutoff);
547 +
548 + _mzd_addmul (B0, U01, B1, cutoff);
549 +
550 + _mzd_trsm_upper_left_even (U00, B0, cutoff);
551 +
552 + mzd_free_window(B0);
553 + mzd_free_window(B1);
554 +
555 + mzd_free_window(U00);
556 + mzd_free_window(U01);
557 + mzd_free_window(U11);
558 + }
559 +}
560 diff -r 7d525bf8b16f -r cafd6739ffed src/trsm.h
561 --- a/src/trsm.h Sat Oct 11 10:16:18 2008 +0200
562 +++ b/src/trsm.h Mon Oct 13 10:12:08 2008 +0200
563 @@ -75,6 +75,43 @@ void _mzd_trsm_upper_right_weird (packed
564 /**
565 * \brief TRiangular System solving with matrix.
566 *
567 + * Solves X L = B where X and B are matrices, and L is lower triangular.
568 + * X is stored inplace on B
569 + * *
570 + * This is the wrapper function including bounds checks. See
571 + * _mzd_trsm_upper_right for implementation details.
572 + *
573 + * \param L Input upper triangular matrix.
574 + * \param B Input matrix, being overwritten by the solution matrix X
575 + * \param cutoff Minimal dimension for Strassen recursion.
576 + *
577 + */
578 +
579 +void mzd_trsm_lower_right (packedmatrix *L, packedmatrix *B, const int cutoff);
580 +
581 +/**
582 + * \brief TRiangular System solving with matrix.
583 + *
584 + * Solves X L = B where X and B are matrices, and L is lower triangular.
585 + * This version assumes that the matrices are at an even position on
586 + * the RADIX grid and that their dimension is a multiple of RADIX.
587 + * X is stored inplace on B
588 + *
589 + * \param L Input lower triangular matrix.
590 + * \param B Input matrix, being overwritten by the solution matrix X
591 + * \param cutoff Minimal dimension for Strassen recursion.
592 + *
593 + * \internal
594 + */
595 +void _mzd_trsm_lower_right (packedmatrix *L, packedmatrix *B, const int cutoff);
596 +
597 +void _mzd_trsm_lower_right_even (packedmatrix *L, packedmatrix *B, const int cutoff);
598 +
599 +void _mzd_trsm_lower_right_weird (packedmatrix *L, packedmatrix *B, const int cutoff);
600 +
601 +/**
602 + * \brief TRiangular System solving with matrix.
603 + *
604 * Solves L X = B where X and B are matrices, and L is lower triangular.
605 * X is stored inplace on B
606 *
607 @@ -107,4 +144,40 @@ void _mzd_trsm_lower_left_even (packedma
608
609 void _mzd_trsm_lower_left_weird (packedmatrix *L, packedmatrix *B, const int cutoff);
610
611 +
612 +/**
613 + * \brief TRiangular System solving with matrix.
614 + *
615 + * Solves U X = B where X and B are matrices, and U is upper triangular.
616 + * X is stored inplace on B
617 + *
618 + * This is the wrapper function including bounds checks. See
619 + * _mzd_trsm_upper_left for implementation details.
620 + *
621 + * \param U Input upper triangular matrix.
622 + * \param B Input matrix, being overwritten by the solution matrix X
623 + * \param cutoff Minimal dimension for Strassen recursion.
624 + *
625 + */
626 +
627 +void mzd_trsm_upper_left (packedmatrix *U, packedmatrix *B, const int cutoff);
628 +
629 +/**
630 + * \brief TRiangular System solving with matrix.
631 + *
632 + * Solves U X = B where X and B are matrices, and U is upper triangular.
633 + * X is stored inplace on B
634 + *
635 + * \param U Input upper triangular matrix.
636 + * \param B Input matrix, being overwritten by the solution matrix X
637 + * \param cutoff Minimal dimension for Strassen recursion.
638 + *
639 + * \internal
640 + */
641 +void _mzd_trsm_upper_left (packedmatrix *U, packedmatrix *B, const int cutoff);
642 +
643 +void _mzd_trsm_upper_left_even (packedmatrix *U, packedmatrix *B, const int cutoff);
644 +
645 +void _mzd_trsm_upper_left_weird (packedmatrix *U, packedmatrix *B, const int cutoff);
646 +
647 #endif
648 diff -r 7d525bf8b16f -r cafd6739ffed testsuite/Makefile
649 --- a/testsuite/Makefile Sat Oct 11 10:16:18 2008 +0200
650 +++ b/testsuite/Makefile Mon Oct 13 10:12:08 2008 +0200
651 @@ -4,7 +4,7 @@ DEBUG=-ggdb
652
653
654
655 -PRGS=test_elimination test_multiplication bench_elimination bench_multiplication bench_addition test_trsm
656 +PRGS=test_elimination test_multiplication bench_elimination bench_multiplication bench_addition test_trsm bench_trsm_lowerleft bench_trsm_upperright bench_trsm_upperleft bench_trsm_lowerright
657
658 CPUCYCLES_DIR=./cpucycles-20060326
659
660 diff -r 7d525bf8b16f -r cafd6739ffed testsuite/bench_trsm_lowerleft.c
661 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
662 +++ b/testsuite/bench_trsm_lowerleft.c Mon Oct 13 10:12:08 2008 +0200
663 @@ -0,0 +1,36 @@
664 +#include <stdlib.h>
665 +
666 +#include "cpucycles.h"
667 +#include "walltime.h"
668 +#include "m4ri.h"
669 +
670 +int main(int argc, char **argv) {
671 + int n,m;
672 + unsigned long long t;
673 + double wt;
674 + double clockZero = 0.0;
675 +
676 + if (argc != 3) {
677 + m4ri_die("Parameters m, n expected.\n");
678 + }
679 + m = atoi(argv[1]);
680 + n = atoi(argv[2]);
681 + packedmatrix *B = mzd_init(m, n);
682 + packedmatrix *L = mzd_init(m, m);
683 + mzd_randomize(B);
684 + mzd_randomize(L);
685 + size_t i,j;
686 + for (i=0; i<n; ++i){
687 + for (j=i+1; j<n;++j)
688 + mzd_write_bit(L,i,j, 0);
689 + mzd_write_bit(L,i,i, 1);
690 + }
691 +
692 + wt = walltime(&clockZero);
693 + t = cpucycles();
694 + mzd_trsm_lower_left(L, B, 2048);
695 + printf("n: %5d, cpu cycles: %llu wall time: %lf\n",n, cpucycles() - t, walltime(&wt));
696 +
697 + mzd_free(B);
698 + mzd_free(L);
699 +}
700 diff -r 7d525bf8b16f -r cafd6739ffed testsuite/bench_trsm_lowerright.c
701 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
702 +++ b/testsuite/bench_trsm_lowerright.c Mon Oct 13 10:12:08 2008 +0200
703 @@ -0,0 +1,36 @@
704 +#include <stdlib.h>
705 +
706 +#include "cpucycles.h"
707 +#include "walltime.h"
708 +#include "m4ri.h"
709 +
710 +int main(int argc, char **argv) {
711 + int n,m;
712 + unsigned long long t;
713 + double wt;
714 + double clockZero = 0.0;
715 +
716 + if (argc != 3) {
717 + m4ri_die("Parameters m, n expected.\n");
718 + }
719 + m = atoi(argv[1]);
720 + n = atoi(argv[2]);
721 + packedmatrix *B = mzd_init(m, n);
722 + packedmatrix *L = mzd_init(n, n);
723 + mzd_randomize(B);
724 + mzd_randomize(L);
725 + size_t i,j;
726 + for (i=0; i<n; ++i){
727 + for (j=i+1; j<n;++j)
728 + mzd_write_bit(L,i,j, 0);
729 + mzd_write_bit(L,i,i, 1);
730 + }
731 +
732 + wt = walltime(&clockZero);
733 + t = cpucycles();
734 + mzd_trsm_lower_right(L, B, 2048);
735 + printf("n: %5d, cpu cycles: %llu wall time: %lf\n",n, cpucycles() - t, walltime(&wt));
736 +
737 + mzd_free(B);
738 + mzd_free(L);
739 +}
740 diff -r 7d525bf8b16f -r cafd6739ffed testsuite/bench_trsm_upperleft.c
741 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
742 +++ b/testsuite/bench_trsm_upperleft.c Mon Oct 13 10:12:08 2008 +0200
743 @@ -0,0 +1,36 @@
744 +#include <stdlib.h>
745 +
746 +#include "cpucycles.h"
747 +#include "walltime.h"
748 +#include "m4ri.h"
749 +
750 +int main(int argc, char **argv) {
751 + int n,m;
752 + unsigned long long t;
753 + double wt;
754 + double clockZero = 0.0;
755 +
756 + if (argc != 3) {
757 + m4ri_die("Parameters m, n expected.\n");
758 + }
759 + m = atoi(argv[1]);
760 + n = atoi(argv[2]);
761 + packedmatrix *B = mzd_init(m, n);
762 + packedmatrix *U = mzd_init(m, m);
763 + mzd_randomize(B);
764 + mzd_randomize(U);
765 + size_t i,j;
766 + for (i=0; i<n; ++i){
767 + for (j=0; j<i;++j)
768 + mzd_write_bit(U,i,j, 0);
769 + mzd_write_bit(U,i,i, 1);
770 + }
771 +
772 + wt = walltime(&clockZero);
773 + t = cpucycles();
774 + mzd_trsm_upper_left(U, B, 2048);
775 + printf("n: %5d, cpu cycles: %llu wall time: %lf\n",n, cpucycles() - t, walltime(&wt));
776 +
777 + mzd_free(B);
778 + mzd_free(U);
779 +}
780 diff -r 7d525bf8b16f -r cafd6739ffed testsuite/bench_trsm_upperright.c
781 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
782 +++ b/testsuite/bench_trsm_upperright.c Mon Oct 13 10:12:08 2008 +0200
783 @@ -0,0 +1,36 @@
784 +#include <stdlib.h>
785 +
786 +#include "cpucycles.h"
787 +#include "walltime.h"
788 +#include "m4ri.h"
789 +
790 +int main(int argc, char **argv) {
791 + int n,m;
792 + unsigned long long t;
793 + double wt;
794 + double clockZero = 0.0;
795 +
796 + if (argc != 3) {
797 + m4ri_die("Parameters m, n expected.\n");
798 + }
799 + m = atoi(argv[1]);
800 + n = atoi(argv[2]);
801 + packedmatrix *B = mzd_init(m, n);
802 + packedmatrix *U = mzd_init(n, n);
803 + mzd_randomize(B);
804 + mzd_randomize(U);
805 + size_t i,j;
806 + for (i=0; i<n; ++i){
807 + for (j=0; j<i;++j)
808 + mzd_write_bit(U,i,j, 0);
809 + mzd_write_bit(U,i,i, 1);
810 + }
811 +
812 + wt = walltime(&clockZero);
813 + t = cpucycles();
814 + mzd_trsm_upper_right(U, B, 2048);
815 + printf("n: %5d, cpu cycles: %llu wall time: %lf\n",n, cpucycles() - t, walltime(&wt));
816 +
817 + mzd_free(B);
818 + mzd_free(U);
819 +}
820 diff -r 7d525bf8b16f -r cafd6739ffed testsuite/test_trsm.c
821 --- a/testsuite/test_trsm.c Sat Oct 11 10:16:18 2008 +0200
822 +++ b/testsuite/test_trsm.c Mon Oct 13 10:12:08 2008 +0200
823 @@ -45,6 +45,59 @@ int test_trsm_upper_right (int m, int n,
824 mzd_free_window (B);
825 mzd_free (W);
826 mzd_free(Ubase);
827 + mzd_free(Bbase);
828 + mzd_free(Bbasecopy);
829 +
830 + if (!status)
831 + printf("passed\n");
832 + else
833 + printf("failed\n");
834 + return status;
835 +}
836 +
837 +int test_trsm_lower_right (int m, int n, int offset){
838 + packedmatrix* Lbase = mzd_init (2048,2048);
839 + packedmatrix* Bbase = mzd_init (2048,2048);
840 + mzd_randomize (Lbase);
841 + mzd_randomize (Bbase);
842 + packedmatrix* Bbasecopy = mzd_copy (NULL, Bbase);
843 +
844 + packedmatrix* L = mzd_init_window (Lbase, 0, offset, n, offset + n);
845 + packedmatrix* B = mzd_init_window (Bbase, 0, offset, m, offset + n);
846 + packedmatrix* W = mzd_copy (NULL, B);
847 +
848 + size_t i,j;
849 + for (i=0; i<n; ++i){
850 + for (j=i+1; j<n;++j)
851 + mzd_write_bit(L,i,j, 0);
852 + mzd_write_bit(L,i,i, 1);
853 + }
854 + mzd_trsm_lower_right (L, B, 2048);
855 +
856 + mzd_addmul(W, B, L, 2048);
857 +
858 + int status = 0;
859 + for ( i=0; i<m; ++i)
860 + for ( j=0; j<n; ++j){
861 + if (mzd_read_bit (W,i,j)){
862 + status = 1;
863 + }
864 + }
865 +
866 + // Verifiying that nothing has been changed around the submatrices
867 + mzd_addmul(W, B, L, 2048);
868 + mzd_copy (B, W);
869 +
870 + for ( i=0; i<2048; ++i)
871 + for ( j=0; j<2048/RADIX; ++j){
872 + if (Bbase->values [Bbase->rowswap[i] + j] != Bbasecopy->values [Bbasecopy->rowswap[i] + j]){
873 + status = 1;
874 + }
875 + }
876 + mzd_free_window (L);
877 + mzd_free_window (B);
878 + mzd_free (W);
879 + mzd_free(Lbase);
880 mzd_free(Bbase);
881 mzd_free(Bbasecopy);
882
883 @@ -110,6 +163,62 @@ int test_trsm_lower_left (int m, int n,
884 return status;
885 }
886
887 +
888 +
889 +int test_trsm_upper_left (int m, int n, int offsetU, int offsetB){
890 + packedmatrix* Ubase = mzd_init (2048,2048);
891 + packedmatrix* Bbase = mzd_init (2048,2048);
892 + mzd_randomize (Ubase);
893 + mzd_randomize (Bbase);
894 + packedmatrix* Bbasecopy = mzd_copy (NULL, Bbase);
895 +
896 + packedmatrix* U = mzd_init_window (Ubase, 0, offsetU, m, offsetU + m);
897 + packedmatrix* B = mzd_init_window (Bbase, 0, offsetB, m, offsetB + n);
898 + packedmatrix* W = mzd_copy (NULL, B);
899 +
900 + size_t i,j;
901 + for (i=0; i<m; ++i){
902 + for (j=0; j<i;++j)
903 + mzd_write_bit(U,i,j, 0);
904 + mzd_write_bit(U,i,i, 1);
905 + }
906 + mzd_trsm_upper_left(U, B, 2048);
907 +
908 + mzd_addmul(W, U, B, 2048);
909 +
910 + int status = 0;
911 + for ( i=0; i<m; ++i)
912 + for ( j=0; j<n; ++j){
913 + if (mzd_read_bit (W,i,j)){
914 + status = 1;
915 + }
916 + }
917 +
918 + // Verifiying that nothing has been changed around the submatrices
919 + mzd_addmul(W, U, B, 2048);
920 +
921 + mzd_copy (B, W);
922 +
923 + for ( i=0; i<2048; ++i)
924 + for ( j=0; j<2048/RADIX; ++j){
925 + if (Bbase->values [Bbase->rowswap[i] + j] != Bbasecopy->values [Bbasecopy->rowswap[i] + j]){
926 + status = 1;
927 + }
928 + }
929 + mzd_free_window (U);
930 + mzd_free_window (B);
931 + mzd_free_window (W);
932 + mzd_free(Ubase);
933 + mzd_free(Bbase);
934 + mzd_free(Bbasecopy);
935 +
936 + if (!status)
937 + printf("passed\n");
938 + else
939 + printf("failed\n");
940 + return status;
941 +}
942 +
943 int main(int argc, char **argv) {
944 int status = 0;
945
946 @@ -125,6 +234,19 @@ int main(int argc, char **argv) {
947 status += test_trsm_upper_right(57, 80, 60);
948 printf("UpperRight: larger, odd placed ... ");
949 status += test_trsm_upper_right(1577, 1802, 189);
950 +
951 + printf("LowerRight: small, even placed ... ");
952 + status += test_trsm_lower_right(57, 10, 0);
953 + printf("LowerRight: large, even placed ... ");
954 + status += test_trsm_lower_right(57, 150, 0);
955 + printf("LowerRight: small, odd placed ... ");
956 + status += test_trsm_lower_right(57, 3, 4);
957 + printf("LowerRight: medium, odd placed ... ");
958 + status += test_trsm_lower_right(57, 4, 62);
959 + printf("LowerRight: large, odd placed ... ");
960 + status += test_trsm_lower_right(57, 80, 60);
961 + printf("LowerRight: larger, odd placed ... ");
962 + status += test_trsm_lower_right(1577, 1802, 189);
963
964 printf("LowerLeft: small L even, small B even ... ");
965 status += test_trsm_lower_left (10, 20, 0, 0);
966 @@ -171,7 +293,53 @@ int main(int argc, char **argv) {
967 printf("LowerLeft: larger L odd, larger B odd ... ");
968 status += test_trsm_lower_left (1764, 1345, 198, 123);
969
970 -
971 +
972 + printf("UpperLeft: small U even, small B even ... ");
973 + status += test_trsm_upper_left (10, 20, 0, 0);
974 + printf("UpperLeft: small U even, large B even ... ");
975 + status += test_trsm_upper_left (10, 80, 0, 0);
976 +
977 + printf("UpperLeft: small U even, small B odd ... ");
978 + status += test_trsm_upper_left (10, 20, 0, 15);
979 + printf("UpperLeft: small U even, large B odd ... ");
980 + status += test_trsm_upper_left (10, 80, 0, 15);
981 +
982 + printf("UpperLeft: small U odd, small B even ... ");
983 + status += test_trsm_upper_left (10, 20, 15, 0);
984 + printf("UpperLeft: small U odd, large B even ... ");
985 + status += test_trsm_upper_left (10, 80, 15, 0);
986 +
987 + printf("UpperLeft: small U odd, small B odd ... ");
988 + status += test_trsm_upper_left (10, 20, 15, 20);
989 + printf("UpperLeft: small U odd, large B odd ... ");
990 + status += test_trsm_upper_left (10, 80, 15, 20);
991 +
992 + printf("UpperLeft: large U even, small B even ... ");
993 + status += test_trsm_upper_left (70, 20, 0, 0);
994 + printf("UpperLeft: large U even, large B even ... ");
995 + status += test_trsm_upper_left (70, 80, 0, 0);
996 +
997 + printf("UpperLeft: large U even, small B odd ... ");
998 + status += test_trsm_upper_left (70, 10, 0, 15);
999 + printf("UpperLeft: large U even, large B odd ... ");
1000 + status += test_trsm_upper_left (70, 80, 0, 15);
1001 +
1002 + printf("UpperLeft: large U odd, small B even ... ");
1003 + status += test_trsm_upper_left (70, 20, 15, 0);
1004 + printf("UpperLeft: large U odd, large B even ... ");
1005 + status += test_trsm_upper_left (70, 80, 15, 0);
1006 +
1007 + printf("UpperLeft: large U odd, small B odd ... ");
1008 + status += test_trsm_upper_left (70, 20, 15, 20);
1009 + printf("UpperLeft: large U odd, large B odd ... ");
1010 + status += test_trsm_upper_left (70, 80, 15, 20);
1011 +
1012 + printf("UpperLeft: larger U odd, larger B odd ... ");
1013 + status += test_trsm_upper_left (770, 1600, 75, 89);
1014 + printf("UpperLeft: larger U odd, larger B odd ... ");
1015 + status += test_trsm_upper_left (1764, 1345, 198, 123);
1016 +
1017 +
1018 if (!status) {
1019 printf("All tests passed.\n");
1020 }
Attached Files
To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.You are not allowed to attach a file to this page.