Attachment 'm4ri_trsm_UL_LR.patch'

Download

   1 # HG changeset patch
   2 # User Clement Pernet <[email protected]>
   3 # Date 1223885528 -7200
   4 # Node ID cafd6739ffedec5f329365aa8d5a9ffb758a55ab
   5 # Parent  7d525bf8b16f4c51f6e45e0b09dda5cdf3eae34c
   6 Added the 2 remaining trsm and the corresponding testsuite and benchmarks.
   7 
   8 diff -r 7d525bf8b16f -r cafd6739ffed src/trsm.c
   9 --- a/src/trsm.c	Sat Oct 11 10:16:18 2008 +0200
  10 +++ b/src/trsm.c	Mon Oct 13 10:12:08 2008 +0200
  11 @@ -24,6 +24,7 @@
  12  #include "parity.h"
  13  #include "stdio.h"
  14  
  15 +/* Upper right case */
  16  void mzd_trsm_upper_right (packedmatrix *U, packedmatrix *B, const int cutoff) {
  17    if(U->nrows != B->ncols)
  18      m4ri_die("mzd_trsm_upper_right: U nrows (%d) need to match B ncols (%d).\n", U->nrows, B->ncols);
  19 @@ -130,52 +131,75 @@ void _mzd_trsm_upper_right_weird (packed
  20    }
  21  }
  22  
  23 +void _mzd_trsm_upper_right_base (packedmatrix* U, packedmatrix* B, const int cutoff){ 
  24 +  
  25 +  size_t mb = B->nrows;
  26 +  size_t nb = B->ncols;
  27 +
  28 +  for (size_t i=1; i < nb; ++i) {
  29 +    /* Computes X_i = B_i + X_{0..i-1} U_{0..i-1,i} */
  30 +    register word ucol = 0;
  31 +    for (size_t k=0; k<i; ++k) {
  32 +      if (GET_BIT (U->values[U->rowswap[k]], i))
  33 +	SET_BIT (ucol, k);
  34 +    }
  35 +    
  36 +    /* doing 64 dotproducts at a time, to use the parity64 parallelism */
  37 +    size_t giantstep;
  38 +    word tmp[64];
  39 +    for (giantstep = 0; giantstep + RADIX < mb; giantstep += RADIX) {
  40 +      for (size_t babystep = 0; babystep < RADIX; ++babystep)
  41 +	tmp [babystep] = B->values [B->rowswap [babystep + giantstep]] & ucol;
  42 +      
  43 +      word dotprod = parity64 (tmp);
  44 +      
  45 +      for (size_t babystep = 0; babystep < RADIX; ++babystep)
  46 +	if (GET_BIT (dotprod, babystep))
  47 +	  FLIP_BIT (B->values [B->rowswap [giantstep + babystep]], i);
  48 +    }
  49 +    
  50 +    for (size_t babystep = 0; babystep < mb - giantstep; ++babystep)
  51 +      tmp [babystep] = B->values [B->rowswap [babystep + giantstep]] & ucol;
  52 +    for (size_t babystep = mb-giantstep; babystep < 64; ++babystep)
  53 +      tmp [babystep] = 0;
  54 +    
  55 +    word dotprod = parity64 (tmp);
  56 +    for (size_t babystep = 0; babystep < mb - giantstep; ++babystep)
  57 +      if (GET_BIT (dotprod, babystep))
  58 +	FLIP_BIT (B->values [B->rowswap [giantstep + babystep ]], i);
  59 +  }
  60 +}
  61 +/* void _mzd_trsm_upper_right_base (packedmatrix* U, packedmatrix* B, const int cutoff){ */
  62 +/*   size_t mb = B->nrows; */
  63 +/*   size_t nb = B->ncols; */
  64 +/*   for (size_t i=1; i < nb; ++i) { */
  65 +/*     /\* Computes X_i = B_i + X_{0..i-1} U_{0..i-1,i} *\/ */
  66 +/*     packedmatrix *Xm = mzd_init_window(B,  0,     0,   mb, i); */
  67 +/*     packedmatrix *Bm = mzd_init_window(B,  0,     i,   mb, i+1); */
  68 +/*     packedmatrix *Um = mzd_init_window(U,  0,     i,   i, i+1); */
  69 +/*     _mzd_addmul(Bm, Xm, Um, cutoff); */
  70 +/*     mzd_free_window(Xm); */
  71 +/*     mzd_free_window(Um); */
  72 +/*     mzd_free_window(Bm); */
  73 +/*   } */
  74 +/* } */
  75 +
  76  /**
  77   * Variant where U and B start at an even bit position
  78   * Assumes that U->ncols < 64
  79   */
  80 +#define TRSM_THRESHOLD 64
  81 +
  82  void _mzd_trsm_upper_right_even (packedmatrix *U, packedmatrix *B, const int cutoff) {
  83  
  84    size_t mb = B->nrows;
  85    size_t nb = B->ncols;
  86 -    
  87 -  if (nb <= RADIX){
  88 +  
  89 +  if (nb <= TRSM_THRESHOLD){
  90      /* base case */
  91 -    for (size_t i=1; i < nb; ++i) {
  92 -
  93 -      /* Computes X_i = B_i + X_{0..i-1} U_{0..i-1,i} */
  94 -
  95 -      register word ucol = 0;
  96 -      for (size_t k=0; k<i; ++k) {
  97 -	if (GET_BIT (U->values[U->rowswap[k]], i))
  98 -	  SET_BIT (ucol, k);
  99 -      }
 100 -
 101 -      /* doing 64 dotproducts at a time, to use the parity64 parallelism */
 102 -      size_t giantstep;
 103 -      word tmp[64];
 104 -      for (giantstep = 0; giantstep + RADIX < mb; giantstep += RADIX) {
 105 -	for (size_t babystep = 0; babystep < RADIX; ++babystep)
 106 -	  tmp [babystep] = B->values [B->rowswap [babystep + giantstep]] & ucol;
 107 -
 108 -	word dotprod = parity64 (tmp);
 109 -
 110 -	for (size_t babystep = 0; babystep < RADIX; ++babystep)
 111 -	  if (GET_BIT (dotprod, babystep))
 112 -	    FLIP_BIT (B->values [B->rowswap [giantstep + babystep]], i);
 113 -      }  
 114 -      
 115 -      for (size_t babystep = 0; babystep < mb - giantstep; ++babystep)
 116 -	tmp [babystep] = B->values [B->rowswap [babystep + giantstep]] & ucol;
 117 -      for (size_t babystep = mb-giantstep; babystep < 64; ++babystep)
 118 -	tmp [babystep] = 0;
 119 -
 120 -      word dotprod = parity64 (tmp);
 121 -      for (size_t babystep = 0; babystep < mb - giantstep; ++babystep)
 122 -	if (GET_BIT (dotprod, babystep))
 123 -	  FLIP_BIT (B->values [B->rowswap [giantstep + babystep ]], i);
 124 -    }
 125 -  } else {
 126 +    _mzd_trsm_upper_right_base (U, B, cutoff);
 127 +  }
 128 +  else {
 129      size_t nb1 = (((nb-1) / RADIX + 1) >> 1) * RADIX;
 130  
 131      packedmatrix *B0 = mzd_init_window(B,  0,     0,   mb, nb1);
 132 @@ -196,6 +220,208 @@ void _mzd_trsm_upper_right_even (packedm
 133      mzd_free_window(U11);
 134    }
 135  }
 136 +
 137 +/* Lower right case */
 138 +void mzd_trsm_lower_right (packedmatrix *L, packedmatrix *B, const int cutoff) {
 139 +  if(L->nrows != B->ncols)
 140 +    m4ri_die("mzd_trsm_lower_right: L nrows (%d) need to match B ncols (%d).\n", L->nrows, B->ncols);
 141 +  if(L->nrows != L->ncols)
 142 +    m4ri_die("mzd_trsm_lower_right: L must be square and is found to be (%d) x (%d).\n", L->nrows, L->ncols);
 143 +  
 144 +  if (cutoff <= 0)
 145 +    m4ri_die("mzd_trsm_lower_right: cutoff must be > 0.\n");
 146 +
 147 +  _mzd_trsm_lower_right (L, B, cutoff);
 148 +}
 149 +
 150 +void _mzd_trsm_lower_right (packedmatrix *L, packedmatrix *B, const int cutoff) {
 151 +  size_t nb = B->ncols;
 152 +  size_t mb = B->nrows;
 153 +  size_t n1 = RADIX-B->offset;
 154 +  if (nb <= n1)
 155 +    _mzd_trsm_lower_right_weird (L, B, cutoff);
 156 +  else{
 157 +  /**
 158 +   \verbatim  
 159 +     
 160 +     |\
 161 +     | \  
 162 +     |  \
 163 +     |L00\
 164 +     |____\
 165 +     |    |\  
 166 +     |    | \
 167 +     |    |  \
 168 +     |L10 |L11\ 
 169 +     |____|____\
 170 +      _________
 171 +     |B0  |B1  |
 172 +     |____|____|
 173 +   \endverbatim
 174 + 
 175 +   * L00 and B0 are possibly located at uneven locations.
 176 +   * Their column dimension is lower than 64
 177 +   * The first column of L10, L11, B1 are aligned to words.
 178 +   */
 179 +    packedmatrix *B0  = mzd_init_window (B,  0,  0, mb, n1);
 180 +    packedmatrix *B1  = mzd_init_window (B,  0, n1, mb, nb);
 181 +    packedmatrix *L00 = mzd_init_window (L,  0,  0, n1, n1);
 182 +    packedmatrix *L10 = mzd_init_window (L,  n1, 0, nb, n1);
 183 +    packedmatrix *L11 = mzd_init_window (L, n1, n1, nb, nb);
 184 +    
 185 +    _mzd_trsm_lower_right_even (L11, B1, cutoff);
 186 +    mzd_addmul (B0, B1, L10, cutoff);
 187 +    _mzd_trsm_lower_right_weird (L00, B0, cutoff);
 188 +    
 189 +    mzd_free_window(B0);
 190 +    mzd_free_window(B1);
 191 +    
 192 +    mzd_free_window(L00);
 193 +    mzd_free_window(L10);
 194 +    mzd_free_window(L11);
 195 +  }
 196 +}
 197 +
 198 +/**
 199 + * Variant where L and B start at an odd bit position
 200 + * Assumes that L->ncols < 64
 201 + */
 202 +void _mzd_trsm_lower_right_weird (packedmatrix *L, packedmatrix *B, const int cutoff) {
 203 +
 204 +  size_t mb = B->nrows;
 205 +  size_t nb = B->ncols;
 206 +  size_t offset = B->offset;
 207 +  
 208 +  for (int i=nb-1; i >=0; --i) {
 209 +    
 210 +    /* Computes X_i = B_i + X_{i+1,n} L_{i+1..n,i} */
 211 +    
 212 +    register word ucol = 0;
 213 +    for (size_t k=i+1; k<nb; ++k) {
 214 +      if (GET_BIT (L->values[L->rowswap[k]], i + L->offset))
 215 +	SET_BIT (ucol, k+offset);
 216 +    }
 217 +    /* doing 64 dotproducts at a time, to use the parity64 parallelism */
 218 +    size_t giantstep;
 219 +    word tmp[64];
 220 +    for (giantstep = 0; giantstep + RADIX < mb; giantstep += RADIX) {
 221 +      for (size_t babystep = 0; babystep < RADIX; ++babystep)
 222 +	tmp [babystep] = B->values [B->rowswap [babystep + giantstep]] & ucol;
 223 +
 224 +      word dotprod = parity64 (tmp);
 225 +      
 226 +      for (size_t babystep = 0; babystep < RADIX; ++babystep)
 227 +	  if (GET_BIT (dotprod, babystep))
 228 +	    FLIP_BIT (B->values [B->rowswap [giantstep + babystep]], i + offset);
 229 +      }      
 230 +    for (size_t babystep = 0; babystep < mb - giantstep; ++babystep){
 231 +      tmp [babystep] = B->values [B->rowswap [babystep + giantstep]] & ucol;
 232 +
 233 +    }
 234 +    for (size_t babystep = mb-giantstep; babystep < 64; ++babystep){
 235 +      tmp [babystep] = 0;
 236 +    }
 237 +
 238 +    word dotprod = parity64 (tmp);
 239 +    
 240 +    for (size_t babystep = 0; babystep < mb - giantstep; ++babystep)
 241 +      if (GET_BIT (dotprod, babystep))
 242 +	FLIP_BIT (B->values [B->rowswap [giantstep + babystep ]], i + offset);
 243 +  }
 244 +}
 245 +
 246 +void _mzd_trsm_lower_right_base (packedmatrix* L, packedmatrix* B, const int cutoff){ 
 247 +  
 248 +  size_t mb = B->nrows;
 249 +  size_t nb = B->ncols;
 250 +
 251 +  for (int i=nb-1; i >=0; --i) {
 252 +    
 253 +    /* Computes X_i = B_i + X_{i+1,n} L_{i+1..n,i} */
 254 +    
 255 +    register word ucol = 0;
 256 +    for (size_t k=i+1; k<nb; ++k) {
 257 +      if (GET_BIT (L->values[L->rowswap[k]], i))
 258 +	SET_BIT (ucol, k);
 259 +    }
 260 +    
 261 +    /* doing 64 dotproducts at a time, to use the parity64 parallelism */
 262 +    size_t giantstep;
 263 +    word tmp[64];
 264 +    for (giantstep = 0; giantstep + RADIX < mb; giantstep += RADIX) {
 265 +      for (size_t babystep = 0; babystep < RADIX; ++babystep)
 266 +	tmp [babystep] = B->values [B->rowswap [babystep + giantstep]] & ucol;
 267 +      
 268 +      word dotprod = parity64 (tmp);
 269 +      
 270 +      for (size_t babystep = 0; babystep < RADIX; ++babystep)
 271 +	if (GET_BIT (dotprod, babystep))
 272 +	  FLIP_BIT (B->values [B->rowswap [giantstep + babystep]], i);
 273 +    }
 274 +    
 275 +    for (size_t babystep = 0; babystep < mb - giantstep; ++babystep)
 276 +      tmp [babystep] = B->values [B->rowswap [babystep + giantstep]] & ucol;
 277 +    for (size_t babystep = mb-giantstep; babystep < 64; ++babystep)
 278 +      tmp [babystep] = 0;
 279 +    
 280 +    word dotprod = parity64 (tmp);
 281 +    for (size_t babystep = 0; babystep < mb - giantstep; ++babystep)
 282 +      if (GET_BIT (dotprod, babystep))
 283 +	FLIP_BIT (B->values [B->rowswap [giantstep + babystep ]], i);
 284 +  }
 285 +}
 286 +/* void _mzd_trsm_lower_right_base (packedmatrix* L, packedmatrix* B, const int cutoff){ */
 287 +/*   size_t mb = B->nrows; */
 288 +/*   size_t nb = B->ncols; */
 289 +/*   for (size_t i=1; i < nb; ++i) { */
 290 +/*     /\* Computes X_i = B_i + X_{0..i-1} L_{0..i-1,i} *\/ */
 291 +/*     packedmatrix *Xm = mzd_init_window(B,  0,     0,   mb, i); */
 292 +/*     packedmatrix *Bm = mzd_init_window(B,  0,     i,   mb, i+1); */
 293 +/*     packedmatrix *Lm = mzd_init_window(L,  0,     i,   i, i+1); */
 294 +/*     _mzd_addmul(Bm, Xm, Lm, cutoff); */
 295 +/*     mzd_free_window(Xm); */
 296 +/*     mzd_free_window(Lm); */
 297 +/*     mzd_free_window(Bm); */
 298 +/*   } */
 299 +/* } */
 300 +
 301 +/**
 302 + * Variant where L and B start at an even bit position
 303 + * Assumes that L->ncols < 64
 304 + */
 305 +#define TRSM_THRESHOLD 64
 306 +
 307 +void _mzd_trsm_lower_right_even (packedmatrix *L, packedmatrix *B, const int cutoff) {
 308 +
 309 +  size_t mb = B->nrows;
 310 +  size_t nb = B->ncols;
 311 +  
 312 +  if (nb <= TRSM_THRESHOLD){
 313 +    /* base case */
 314 +    _mzd_trsm_lower_right_base (L, B, cutoff);
 315 +  }
 316 +  else {
 317 +    size_t nb1 = (((nb-1) / RADIX + 1) >> 1) * RADIX;
 318 +
 319 +    packedmatrix *B0 = mzd_init_window(B,  0,     0,   mb, nb1);
 320 +    packedmatrix *B1 = mzd_init_window(B,  0,   nb1,   mb, nb);
 321 +    packedmatrix *L00 = mzd_init_window(L, 0,     0, nb1, nb1);
 322 +    packedmatrix *L10 = mzd_init_window(L, nb1, 0, nb, nb1);
 323 +    packedmatrix *L11 = mzd_init_window(L, nb1, nb1,  nb, nb);
 324 +
 325 +    _mzd_trsm_lower_right_even (L11, B1, cutoff);
 326 +    mzd_addmul (B0, B1, L10, cutoff);
 327 +    _mzd_trsm_lower_right_even (L00, B0, cutoff);
 328 +
 329 +    mzd_free_window(B0);
 330 +    mzd_free_window(B1);
 331 +
 332 +    mzd_free_window(L00);
 333 +    mzd_free_window(L10);
 334 +    mzd_free_window(L11);
 335 +  }
 336 +}
 337 +
 338  
 339  /* Lower Left Implementations */
 340  
 341 @@ -236,7 +462,7 @@ void _mzd_trsm_lower_left (packedmatrix 
 342         * |L10 |L11\  |      |
 343         * |____|____\ |______|
 344         * 
 345 -       * L00 L10 and B0 are possibly located at uneven locations.
 346 +       * L00 L10 B0 and B1 are possibly located at uneven locations.
 347         * Their column dimension is lower than 64
 348         * The first column of L01, L11, B1 are aligned to words.
 349         */
 350 @@ -279,6 +505,7 @@ void _mzd_trsm_lower_left_weird (packedm
 351        mask_begin = ~mask_begin;
 352      word mask_end = LEFT_BITMASK(nbrest);
 353  
 354 +    // L[0,0] = 1, so no work required for i=0
 355      for (size_t i=1; i < mb; ++i) {
 356        
 357        /* Computes X_i = B_i + L_{i,0..i-1} X_{0..i-1}  */
 358 @@ -390,3 +617,201 @@ void _mzd_trsm_lower_left_even (packedma
 359      mzd_free_window(L11);
 360    }
 361  }
 362 +
 363 +
 364 +
 365 +/* Upper Left Implementations */
 366 +
 367 +void mzd_trsm_upper_left (packedmatrix *U, packedmatrix *B, const int cutoff) {
 368 +  if(U->ncols != B->nrows)
 369 +    m4ri_die("mzd_trsm_upper_left: U ncols (%d) need to match B nrows (%d).\n", U->ncols, B->nrows);
 370 +  if(U->nrows != U->ncols)
 371 +    m4ri_die("mzd_trsm_upper_left: U must be square and is found to be (%d) x (%d).\n", U->nrows, U->ncols);
 372 +  
 373 +  if (cutoff <= 0)
 374 +    m4ri_die("mzd_trsm_upper_left: cutoff must be > 0.\n");
 375 +
 376 +  _mzd_trsm_upper_left (U, B, cutoff);
 377 +}
 378 +
 379 +void _mzd_trsm_upper_left (packedmatrix *U, packedmatrix *B, const int cutoff) {
 380 +
 381 +
 382 +  if (!U->offset)
 383 +    _mzd_trsm_upper_left_even (U, B, cutoff);
 384 +  else{
 385 +    size_t nb = B->ncols;
 386 +    size_t mb = B->nrows;
 387 +    size_t m1 = RADIX - U->offset;
 388 +    if (mb <= m1)
 389 +      _mzd_trsm_upper_left_weird (U, B, cutoff);
 390 +    else{
 391 +      /**
 392 +       *  __________   ______
 393 +       *  \ U00|    | |      |
 394 +       *   \   |U01 | |      |
 395 +       *    \  |    | |  B0  |
 396 +       *     \ |    | |      |
 397 +       *      \|____| |______|
 398 +       *       \    | |      |
 399 +       *        \U11| |      |
 400 +       *         \  | |  B1  |
 401 +       *          \ | |      |
 402 +       *           \| |______|
 403 +       * 
 404 +       * U00, B0 and B1 are possibly located at uneven locations.
 405 +       * Their column dimension is greater than 64
 406 +       * The first column of U01, U11, B0 and B1 are aligned to words.
 407 +       */
 408 +      
 409 +      packedmatrix *B0  = mzd_init_window (B,  0,  0, m1, nb);
 410 +      packedmatrix *B1  = mzd_init_window (B,  m1, 0, mb, nb);
 411 +      packedmatrix *U00 = mzd_init_window (U,  0,  0, m1, m1);
 412 +      packedmatrix *U01 = mzd_init_window (U,  0, m1, m1, mb);
 413 +      packedmatrix *U11 = mzd_init_window (U, m1, m1, mb, mb);
 414 +      
 415 +      _mzd_trsm_upper_left_even (U11, B1, cutoff);
 416 +      mzd_addmul (B0, U01, B1, cutoff);
 417 +      _mzd_trsm_upper_left_weird (U00, B0, cutoff);
 418 +    
 419 +      mzd_free_window(B0);
 420 +      mzd_free_window(B1);
 421 +      
 422 +      mzd_free_window(U00);
 423 +      mzd_free_window(U01);
 424 +      mzd_free_window(U11);
 425 +    }
 426 +  }
 427 +}
 428 +
 429 +/**
 430 + * Variant where U and B start at an odd bit position
 431 + * Assumes that U->ncols < 64
 432 + */
 433 +void _mzd_trsm_upper_left_weird (packedmatrix *U, packedmatrix *B, const int cutoff) {
 434 +
 435 +  size_t mb = B->nrows;
 436 +  size_t nb = B->ncols;
 437 +  size_t Boffset = B->offset;
 438 +  size_t nbrest = (nb + Boffset) % RADIX;
 439 +  if (nb + Boffset >= RADIX) {
 440 +
 441 +    // Large B
 442 +    word mask_begin = RIGHT_BITMASK(RADIX-B->offset);
 443 +    if (B->offset == 0)
 444 +      mask_begin = ~mask_begin;
 445 +    word mask_end = LEFT_BITMASK(nbrest);
 446 +
 447 +    // U[mb-1,mb-1] = 1, so no work required for i=mb-1
 448 +    for (int i=mb-2; i >= 0; --i) {
 449 +      
 450 +      /* Computes X_i = B_i + U_{i,i+1..mb} X_{i+1..mb}  */
 451 +      size_t Uidx = U->rowswap[i];
 452 +      size_t Bidx = B->rowswap[i];
 453 +
 454 +      for (size_t k=i+1; k<mb; ++k) {
 455 +	if (GET_BIT (U->values [Uidx], k + U->offset)){
 456 +	  B->values [Bidx] ^= B->values [B->rowswap [k]] & mask_begin;
 457 +	  for (size_t j = 1; j < B->width-1; ++j)
 458 +	    B->values [Bidx + j] ^= B->values [B->rowswap [k] + j];
 459 +	  B->values [Bidx + B->width - 1] ^= B->values [B->rowswap [k] + B->width - 1] & mask_end;
 460 +	}
 461 +      }
 462 +    }
 463 +  } else { // Small B
 464 +
 465 +    word mask = ((ONE << nb) - 1) ;
 466 +    mask <<= (RADIX-nb-B->offset);
 467 +
 468 +    // U[mb-1,mb-1] = 1, so no work required for i=mb-1
 469 +    for (int i=mb-2; i >= 0; --i) {
 470 +      /* Computes X_i = B_i + U_{i,i+1..mb} X_{i+1..mb}  */
 471 +      size_t Uidx = U->rowswap[i];
 472 +      size_t Bidx = B->rowswap[i];
 473 +
 474 +      for (size_t k=i+1; k<mb; ++k) {
 475 +	if (GET_BIT (U->values [Uidx], k + U->offset)){
 476 +	  B->values [Bidx] ^= B->values [B->rowswap [k]] & mask;
 477 +	}
 478 +      }
 479 +    }
 480 +  }
 481 +}
 482 +
 483 +/**
 484 + * Variant where U and B start at an odd bit position
 485 + * Assumes that U->ncols < 64
 486 + */
 487 +void _mzd_trsm_upper_left_even (packedmatrix *U, packedmatrix *B, const int cutoff) {
 488 +
 489 +  size_t mb = B->nrows;
 490 +  size_t nb = B->ncols;
 491 +  size_t Boffset = B->offset;
 492 +  size_t nbrest = (nb + Boffset) % RADIX;
 493 +
 494 +  if (mb <= RADIX){
 495 +    /* base case */
 496 +
 497 +    if (nb + B->offset >= RADIX) {
 498 +      // B is large
 499 +      word mask_begin = RIGHT_BITMASK(RADIX-B->offset);
 500 +      if (B->offset == 0)
 501 +        mask_begin = ~mask_begin;
 502 +      word mask_end = LEFT_BITMASK(nbrest);
 503 +
 504 +      // U[mb-1,mb-1] = 1, so no work required for i=mb-1
 505 +      for (int i=mb-2; i >= 0; --i) {
 506 +
 507 +	/* Computes X_i = B_i + U_{i,i+1..mb} X_{i+1..mb}  */
 508 +	size_t Uidx = U->rowswap[i];
 509 +	size_t Bidx = B->rowswap[i];
 510 +
 511 +	for (size_t k=i+1; k<mb; ++k) {
 512 +	  if (GET_BIT (U->values [Uidx], k)){
 513 +	    B->values [Bidx] ^= B->values [B->rowswap [k]] & mask_begin;
 514 +	    for (size_t j = 1; j < B->width-1; ++j)
 515 +	      B->values [Bidx + j] ^= B->values [B->rowswap [k] + j];
 516 +	    B->values [Bidx + B->width - 1] ^= B->values [B->rowswap [k] + B->width - 1] & mask_end;
 517 +	  }
 518 +	}
 519 +      }
 520 +    } else { // B is small
 521 +      word mask = ((ONE << nb) - 1) ;
 522 +      mask <<= (RADIX-nb-B->offset);
 523 +      // U[mb-1,mb-1] = 1, so no work required for i=mb-1
 524 +      for (int i=mb-2; i >= 0; --i) {
 525 +
 526 +	/* Computes X_i = B_i + U_{i,i+1..mb} X_{i+1..mb}  */
 527 +	size_t Uidx = U->rowswap [i];
 528 +	size_t Bidx = B->rowswap [i];
 529 +
 530 +	for (size_t k=i+1; k<mb; ++k) {
 531 +	  if (GET_BIT (U->values [Uidx], k)){
 532 +	    B->values [Bidx] ^= B->values [B->rowswap[k]] & mask;
 533 +	  }
 534 +	}
 535 +      }
 536 +    }
 537 +  } else {
 538 +    size_t mb1 = (((mb-1) / RADIX + 1) >> 1) * RADIX;
 539 +
 540 +    packedmatrix *B0 = mzd_init_window(B,  0,     0,   mb1, nb);
 541 +    packedmatrix *B1 = mzd_init_window(B, mb1,    0,   mb,  nb);
 542 +    packedmatrix *U00 = mzd_init_window(U, 0,     0, mb1, mb1);
 543 +    packedmatrix *U01 = mzd_init_window(U, 0,   mb1, mb1, mb);
 544 +    packedmatrix *U11 = mzd_init_window(U, mb1, mb1, mb, mb);
 545 +
 546 +    _mzd_trsm_upper_left_even (U11, B1, cutoff);
 547 +
 548 +    _mzd_addmul (B0, U01, B1, cutoff);
 549 +
 550 +    _mzd_trsm_upper_left_even (U00, B0, cutoff);
 551 +
 552 +    mzd_free_window(B0);
 553 +    mzd_free_window(B1);
 554 +
 555 +    mzd_free_window(U00);
 556 +    mzd_free_window(U01);
 557 +    mzd_free_window(U11);
 558 +  }
 559 +}
 560 diff -r 7d525bf8b16f -r cafd6739ffed src/trsm.h
 561 --- a/src/trsm.h	Sat Oct 11 10:16:18 2008 +0200
 562 +++ b/src/trsm.h	Mon Oct 13 10:12:08 2008 +0200
 563 @@ -75,6 +75,43 @@ void _mzd_trsm_upper_right_weird (packed
 564  /**
 565   * \brief TRiangular System solving with matrix.
 566   *
 567 + * Solves X L = B where X and B are matrices, and L is lower triangular.
 568 + * X is stored inplace on B
 569 + * * 
 570 + * This is the wrapper function including bounds checks. See
 571 + * _mzd_trsm_upper_right for implementation details.
 572 + *
 573 + * \param L Input upper triangular matrix.
 574 + * \param B Input matrix, being overwritten by the solution matrix X
 575 + * \param cutoff Minimal dimension for Strassen recursion.
 576 + *
 577 + */
 578 +
 579 +void mzd_trsm_lower_right (packedmatrix *L, packedmatrix *B, const int cutoff);
 580 +
 581 +/**
 582 + * \brief TRiangular System solving with matrix.
 583 + *
 584 + * Solves X L = B where X and B are matrices, and L is lower triangular.
 585 + * This version assumes that the matrices are at an even position on
 586 + * the RADIX grid and that their dimension is a multiple of RADIX.
 587 + * X is stored inplace on B
 588 + *
 589 + * \param L Input lower triangular matrix.
 590 + * \param B Input matrix, being overwritten by the solution matrix X
 591 + * \param cutoff Minimal dimension for Strassen recursion.
 592 + *
 593 + * \internal
 594 + */
 595 +void _mzd_trsm_lower_right (packedmatrix *L, packedmatrix *B, const int cutoff);
 596 +
 597 +void _mzd_trsm_lower_right_even (packedmatrix *L, packedmatrix *B, const int cutoff);
 598 +
 599 +void _mzd_trsm_lower_right_weird (packedmatrix *L, packedmatrix *B, const int cutoff);
 600 +
 601 +/**
 602 + * \brief TRiangular System solving with matrix.
 603 + *
 604   * Solves L X = B where X and B are matrices, and L is lower triangular.
 605   *  X is stored inplace on B
 606   *  
 607 @@ -107,4 +144,40 @@ void _mzd_trsm_lower_left_even (packedma
 608  
 609  void _mzd_trsm_lower_left_weird (packedmatrix *L, packedmatrix *B, const int cutoff);
 610  
 611 +
 612 +/**
 613 + * \brief TRiangular System solving with matrix.
 614 + *
 615 + * Solves U X = B where X and B are matrices, and U is upper triangular.
 616 + *  X is stored inplace on B
 617 + *  
 618 + * This is the wrapper function including bounds checks. See
 619 + * _mzd_trsm_upper_left for implementation details.
 620 + *
 621 + * \param U Input upper triangular matrix.
 622 + * \param B Input matrix, being overwritten by the solution matrix X
 623 + * \param cutoff Minimal dimension for Strassen recursion.
 624 + *
 625 + */
 626 +
 627 +void mzd_trsm_upper_left (packedmatrix *U, packedmatrix *B, const int cutoff);
 628 +
 629 +/**
 630 + * \brief TRiangular System solving with matrix.
 631 + *
 632 + * Solves U X = B where X and B are matrices, and U is upper triangular.
 633 + * X is stored inplace on B
 634 + *
 635 + * \param U Input upper triangular matrix.
 636 + * \param B Input matrix, being overwritten by the solution matrix X
 637 + * \param cutoff Minimal dimension for Strassen recursion.
 638 + *
 639 + * \internal
 640 + */
 641 +void _mzd_trsm_upper_left (packedmatrix *U, packedmatrix *B, const int cutoff);
 642 +
 643 +void _mzd_trsm_upper_left_even (packedmatrix *U, packedmatrix *B, const int cutoff);
 644 +
 645 +void _mzd_trsm_upper_left_weird (packedmatrix *U, packedmatrix *B, const int cutoff);
 646 +
 647  #endif
 648 diff -r 7d525bf8b16f -r cafd6739ffed testsuite/Makefile
 649 --- a/testsuite/Makefile	Sat Oct 11 10:16:18 2008 +0200
 650 +++ b/testsuite/Makefile	Mon Oct 13 10:12:08 2008 +0200
 651 @@ -4,7 +4,7 @@ DEBUG=-ggdb
 652  
 653  
 654  
 655 -PRGS=test_elimination test_multiplication bench_elimination bench_multiplication bench_addition test_trsm
 656 +PRGS=test_elimination test_multiplication bench_elimination bench_multiplication bench_addition test_trsm bench_trsm_lowerleft bench_trsm_upperright bench_trsm_upperleft bench_trsm_lowerright
 657  
 658  CPUCYCLES_DIR=./cpucycles-20060326
 659  
 660 diff -r 7d525bf8b16f -r cafd6739ffed testsuite/bench_trsm_lowerleft.c
 661 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
 662 +++ b/testsuite/bench_trsm_lowerleft.c	Mon Oct 13 10:12:08 2008 +0200
 663 @@ -0,0 +1,36 @@
 664 +#include <stdlib.h>
 665 +
 666 +#include "cpucycles.h"
 667 +#include "walltime.h"
 668 +#include "m4ri.h"
 669 +
 670 +int main(int argc, char **argv) {
 671 +  int n,m;
 672 +  unsigned long long t;
 673 +  double wt;
 674 +  double clockZero = 0.0;
 675 +
 676 +  if (argc != 3) {
 677 +    m4ri_die("Parameters m, n expected.\n");
 678 +  }
 679 +  m = atoi(argv[1]);
 680 +  n = atoi(argv[2]);
 681 +  packedmatrix *B = mzd_init(m, n);
 682 +  packedmatrix *L = mzd_init(m, m);
 683 +  mzd_randomize(B);
 684 +  mzd_randomize(L);
 685 +  size_t i,j;
 686 +  for (i=0; i<n; ++i){
 687 +    for (j=i+1; j<n;++j)
 688 +      mzd_write_bit(L,i,j, 0);
 689 +    mzd_write_bit(L,i,i, 1);
 690 +  }
 691 +  
 692 +  wt = walltime(&clockZero);
 693 +  t = cpucycles();
 694 +  mzd_trsm_lower_left(L, B, 2048);
 695 +  printf("n: %5d, cpu cycles: %llu wall time: %lf\n",n, cpucycles() - t, walltime(&wt));
 696 +
 697 +  mzd_free(B);
 698 +  mzd_free(L);
 699 +}
 700 diff -r 7d525bf8b16f -r cafd6739ffed testsuite/bench_trsm_lowerright.c
 701 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
 702 +++ b/testsuite/bench_trsm_lowerright.c	Mon Oct 13 10:12:08 2008 +0200
 703 @@ -0,0 +1,36 @@
 704 +#include <stdlib.h>
 705 +
 706 +#include "cpucycles.h"
 707 +#include "walltime.h"
 708 +#include "m4ri.h"
 709 +
 710 +int main(int argc, char **argv) {
 711 +  int n,m;
 712 +  unsigned long long t;
 713 +  double wt;
 714 +  double clockZero = 0.0;
 715 +
 716 +  if (argc != 3) {
 717 +    m4ri_die("Parameters m, n expected.\n");
 718 +  }
 719 +  m = atoi(argv[1]);
 720 +  n = atoi(argv[2]);
 721 +  packedmatrix *B = mzd_init(m, n);
 722 +  packedmatrix *L = mzd_init(n, n);
 723 +  mzd_randomize(B);
 724 +  mzd_randomize(L);
 725 +  size_t i,j;
 726 +  for (i=0; i<n; ++i){
 727 +    for (j=i+1; j<n;++j)
 728 +      mzd_write_bit(L,i,j, 0);
 729 +    mzd_write_bit(L,i,i, 1);
 730 +  }
 731 +  
 732 +  wt = walltime(&clockZero);
 733 +  t = cpucycles();
 734 +  mzd_trsm_lower_right(L, B, 2048);
 735 +  printf("n: %5d, cpu cycles: %llu wall time: %lf\n",n, cpucycles() - t, walltime(&wt));
 736 +
 737 +  mzd_free(B);
 738 +  mzd_free(L);
 739 +}
 740 diff -r 7d525bf8b16f -r cafd6739ffed testsuite/bench_trsm_upperleft.c
 741 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
 742 +++ b/testsuite/bench_trsm_upperleft.c	Mon Oct 13 10:12:08 2008 +0200
 743 @@ -0,0 +1,36 @@
 744 +#include <stdlib.h>
 745 +
 746 +#include "cpucycles.h"
 747 +#include "walltime.h"
 748 +#include "m4ri.h"
 749 +
 750 +int main(int argc, char **argv) {
 751 +  int n,m;
 752 +  unsigned long long t;
 753 +  double wt;
 754 +  double clockZero = 0.0;
 755 +
 756 +  if (argc != 3) {
 757 +    m4ri_die("Parameters m, n expected.\n");
 758 +  }
 759 +  m = atoi(argv[1]);
 760 +  n = atoi(argv[2]);
 761 +  packedmatrix *B = mzd_init(m, n);
 762 +  packedmatrix *U = mzd_init(m, m);
 763 +  mzd_randomize(B);
 764 +  mzd_randomize(U);
 765 +  size_t i,j;
 766 +  for (i=0; i<n; ++i){
 767 +    for (j=0; j<i;++j)
 768 +      mzd_write_bit(U,i,j, 0);
 769 +    mzd_write_bit(U,i,i, 1);
 770 +  }
 771 +  
 772 +  wt = walltime(&clockZero);
 773 +  t = cpucycles();
 774 +  mzd_trsm_upper_left(U, B, 2048);
 775 +  printf("n: %5d, cpu cycles: %llu wall time: %lf\n",n, cpucycles() - t, walltime(&wt));
 776 +
 777 +  mzd_free(B);
 778 +  mzd_free(U);
 779 +}
 780 diff -r 7d525bf8b16f -r cafd6739ffed testsuite/bench_trsm_upperright.c
 781 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
 782 +++ b/testsuite/bench_trsm_upperright.c	Mon Oct 13 10:12:08 2008 +0200
 783 @@ -0,0 +1,36 @@
 784 +#include <stdlib.h>
 785 +
 786 +#include "cpucycles.h"
 787 +#include "walltime.h"
 788 +#include "m4ri.h"
 789 +
 790 +int main(int argc, char **argv) {
 791 +  int n,m;
 792 +  unsigned long long t;
 793 +  double wt;
 794 +  double clockZero = 0.0;
 795 +
 796 +  if (argc != 3) {
 797 +    m4ri_die("Parameters m, n expected.\n");
 798 +  }
 799 +  m = atoi(argv[1]);
 800 +  n = atoi(argv[2]);
 801 +  packedmatrix *B = mzd_init(m, n);
 802 +  packedmatrix *U = mzd_init(n, n);
 803 +  mzd_randomize(B);
 804 +  mzd_randomize(U);
 805 +  size_t i,j;
 806 +  for (i=0; i<n; ++i){
 807 +    for (j=0; j<i;++j)
 808 +      mzd_write_bit(U,i,j, 0);
 809 +    mzd_write_bit(U,i,i, 1);
 810 +  }
 811 +  
 812 +  wt = walltime(&clockZero);
 813 +  t = cpucycles();
 814 +  mzd_trsm_upper_right(U, B, 2048);
 815 +  printf("n: %5d, cpu cycles: %llu wall time: %lf\n",n, cpucycles() - t, walltime(&wt));
 816 +
 817 +  mzd_free(B);
 818 +  mzd_free(U);
 819 +}
 820 diff -r 7d525bf8b16f -r cafd6739ffed testsuite/test_trsm.c
 821 --- a/testsuite/test_trsm.c	Sat Oct 11 10:16:18 2008 +0200
 822 +++ b/testsuite/test_trsm.c	Mon Oct 13 10:12:08 2008 +0200
 823 @@ -45,6 +45,59 @@ int test_trsm_upper_right (int m, int n,
 824    mzd_free_window (B);
 825    mzd_free (W);
 826    mzd_free(Ubase);
 827 +  mzd_free(Bbase);
 828 +  mzd_free(Bbasecopy);
 829 +
 830 +  if (!status)
 831 +    printf("passed\n");
 832 +  else
 833 +    printf("failed\n");
 834 +  return status;
 835 +}
 836 +
 837 +int test_trsm_lower_right (int m, int n, int offset){
 838 +  packedmatrix* Lbase = mzd_init (2048,2048);
 839 +  packedmatrix* Bbase = mzd_init (2048,2048);
 840 +  mzd_randomize (Lbase);
 841 +  mzd_randomize (Bbase);
 842 +  packedmatrix* Bbasecopy = mzd_copy (NULL, Bbase);
 843 +
 844 +  packedmatrix* L = mzd_init_window (Lbase, 0, offset, n, offset + n);
 845 +  packedmatrix* B = mzd_init_window (Bbase, 0, offset, m, offset + n);
 846 +  packedmatrix* W = mzd_copy (NULL, B);
 847 +
 848 +  size_t i,j;
 849 +  for (i=0; i<n; ++i){
 850 +    for (j=i+1; j<n;++j)
 851 +      mzd_write_bit(L,i,j, 0);
 852 +    mzd_write_bit(L,i,i, 1);
 853 +  }
 854 +  mzd_trsm_lower_right (L, B, 2048);
 855 +
 856 +  mzd_addmul(W, B, L, 2048);
 857 +
 858 +  int status = 0;
 859 +  for ( i=0; i<m; ++i)
 860 +    for ( j=0; j<n; ++j){
 861 +      if (mzd_read_bit (W,i,j)){
 862 +	status = 1;
 863 +      }
 864 +    }
 865 +
 866 +  // Verifiying that nothing has been changed around the submatrices
 867 +  mzd_addmul(W, B, L, 2048);
 868 +  mzd_copy (B, W);
 869 +
 870 +  for ( i=0; i<2048; ++i)
 871 +    for ( j=0; j<2048/RADIX; ++j){
 872 +      if (Bbase->values [Bbase->rowswap[i] + j] != Bbasecopy->values [Bbasecopy->rowswap[i] + j]){
 873 +	status = 1;
 874 +      }
 875 +    }
 876 +  mzd_free_window (L);
 877 +  mzd_free_window (B);
 878 +  mzd_free (W);
 879 +  mzd_free(Lbase);
 880    mzd_free(Bbase);
 881    mzd_free(Bbasecopy);
 882  
 883 @@ -110,6 +163,62 @@ int test_trsm_lower_left (int m, int n, 
 884    return status;
 885  }
 886  
 887 +
 888 +
 889 +int test_trsm_upper_left (int m, int n, int offsetU, int offsetB){
 890 +  packedmatrix* Ubase = mzd_init (2048,2048);
 891 +  packedmatrix* Bbase = mzd_init (2048,2048);
 892 +  mzd_randomize (Ubase);
 893 +  mzd_randomize (Bbase);
 894 +  packedmatrix* Bbasecopy = mzd_copy (NULL, Bbase);
 895 +
 896 +  packedmatrix* U = mzd_init_window (Ubase, 0, offsetU, m, offsetU + m);
 897 +  packedmatrix* B = mzd_init_window (Bbase, 0, offsetB, m, offsetB + n);
 898 +  packedmatrix* W = mzd_copy (NULL, B);
 899 +
 900 +  size_t i,j;
 901 +  for (i=0; i<m; ++i){
 902 +    for (j=0; j<i;++j)
 903 +      mzd_write_bit(U,i,j, 0);
 904 +    mzd_write_bit(U,i,i, 1);
 905 +  }    
 906 +  mzd_trsm_upper_left(U, B, 2048);
 907 +  
 908 +  mzd_addmul(W, U, B, 2048);
 909 +
 910 +  int status = 0;
 911 +  for ( i=0; i<m; ++i)
 912 +    for ( j=0; j<n; ++j){
 913 +      if (mzd_read_bit (W,i,j)){
 914 +	status = 1;
 915 +      }
 916 +    }
 917 +
 918 +  // Verifiying that nothing has been changed around the submatrices
 919 +  mzd_addmul(W, U, B, 2048);
 920 +
 921 +  mzd_copy (B, W);
 922 +
 923 +  for ( i=0; i<2048; ++i)
 924 +    for ( j=0; j<2048/RADIX; ++j){
 925 +      if (Bbase->values [Bbase->rowswap[i] + j] != Bbasecopy->values [Bbasecopy->rowswap[i] + j]){
 926 +	status = 1;
 927 +      }
 928 +    }
 929 +  mzd_free_window (U);
 930 +  mzd_free_window (B);
 931 +  mzd_free_window (W);
 932 +  mzd_free(Ubase);
 933 +  mzd_free(Bbase);
 934 +  mzd_free(Bbasecopy);
 935 +
 936 +  if (!status)
 937 +    printf("passed\n");
 938 +  else
 939 +    printf("failed\n");
 940 +  return status;
 941 +}
 942 +
 943  int main(int argc, char **argv) {
 944    int status = 0;
 945  
 946 @@ -125,6 +234,19 @@ int main(int argc, char **argv) {
 947    status += test_trsm_upper_right(57, 80, 60);
 948    printf("UpperRight: larger, odd placed ... ");
 949    status += test_trsm_upper_right(1577, 1802, 189);
 950 +
 951 +  printf("LowerRight: small, even placed ... ");
 952 +  status += test_trsm_lower_right(57, 10, 0);
 953 +  printf("LowerRight: large, even placed ... ");
 954 +  status += test_trsm_lower_right(57, 150, 0);
 955 +  printf("LowerRight: small, odd placed  ... ");
 956 +  status += test_trsm_lower_right(57, 3, 4);
 957 +  printf("LowerRight: medium, odd placed ... ");
 958 +  status += test_trsm_lower_right(57, 4, 62);
 959 +  printf("LowerRight: large, odd placed  ... ");
 960 +  status += test_trsm_lower_right(57, 80, 60);
 961 +  printf("LowerRight: larger, odd placed ... ");
 962 +  status += test_trsm_lower_right(1577, 1802, 189);
 963  
 964    printf("LowerLeft: small L even, small B even ... ");
 965    status += test_trsm_lower_left (10, 20, 0, 0);
 966 @@ -171,7 +293,53 @@ int main(int argc, char **argv) {
 967    printf("LowerLeft: larger L odd, larger B odd ... ");
 968    status += test_trsm_lower_left (1764, 1345, 198, 123);
 969  
 970 -  
 971 +
 972 +  printf("UpperLeft: small U even, small B even ... ");
 973 +  status += test_trsm_upper_left (10, 20, 0, 0);
 974 +  printf("UpperLeft: small U even, large B even ... ");
 975 +  status += test_trsm_upper_left (10, 80, 0, 0);
 976 +
 977 +  printf("UpperLeft: small U even, small B odd  ... ");
 978 +  status += test_trsm_upper_left (10, 20, 0, 15);
 979 +  printf("UpperLeft: small U even, large B odd  ... ");
 980 +  status += test_trsm_upper_left (10, 80, 0, 15);
 981 +
 982 +  printf("UpperLeft: small U odd, small B even  ... ");
 983 +  status += test_trsm_upper_left (10, 20, 15, 0);
 984 +  printf("UpperLeft: small U odd, large B even  ... ");
 985 +  status += test_trsm_upper_left (10, 80, 15, 0);
 986 +
 987 +  printf("UpperLeft: small U odd, small B odd   ... ");
 988 +  status += test_trsm_upper_left (10, 20, 15, 20);
 989 +  printf("UpperLeft: small U odd, large B odd   ... ");
 990 +  status += test_trsm_upper_left (10, 80, 15, 20);
 991 +
 992 +  printf("UpperLeft: large U even, small B even ... ");
 993 +  status += test_trsm_upper_left (70, 20, 0, 0);
 994 +  printf("UpperLeft: large U even, large B even ... ");
 995 +  status += test_trsm_upper_left (70, 80, 0, 0);
 996 +
 997 +  printf("UpperLeft: large U even, small B odd  ... ");
 998 +  status += test_trsm_upper_left (70, 10, 0, 15);
 999 +  printf("UpperLeft: large U even, large B odd  ... ");
1000 +  status += test_trsm_upper_left (70, 80, 0, 15);
1001 +
1002 +  printf("UpperLeft: large U odd, small B even  ... ");
1003 +  status += test_trsm_upper_left (70, 20, 15, 0);
1004 +  printf("UpperLeft: large U odd, large B even  ... ");
1005 +  status += test_trsm_upper_left (70, 80, 15, 0);
1006 +
1007 +  printf("UpperLeft: large U odd, small B odd   ... ");
1008 +  status += test_trsm_upper_left (70, 20, 15, 20);
1009 +  printf("UpperLeft: large U odd, large B odd   ... ");
1010 +  status += test_trsm_upper_left (70, 80, 15, 20);
1011 +
1012 +  printf("UpperLeft: larger U odd, larger B odd ... ");
1013 +  status += test_trsm_upper_left (770, 1600, 75, 89);
1014 +  printf("UpperLeft: larger U odd, larger B odd ... ");
1015 +  status += test_trsm_upper_left (1764, 1345, 198, 123);
1016 +
1017 +
1018    if (!status) {
1019      printf("All tests passed.\n");
1020    }

Attached Files

To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.

You are not allowed to attach a file to this page.