#include "nsbbsc.h"

void BSC_MatTriangSlvLU_CaDADBbC_double(
                 const int mb, const int n, 
                 const double *dvl, const double *dvr,
                 const double alpha,
                 const double *val, const int *bindx, 
                 const int *bpntrb, const int *bpntre, const int lb,
                 const double *b, const int ldb, const double beta, 
                 double *c, const int ldc,
                 double *work, const int ind_base)
{
  const double *pb;
  double *pc=c;
  double *cj;
  const double *pdr;
  const double *pdl;
  double *pwork=work;                             /* MTSDVLBETADEL */
  double *ptmpwork;                               /* MTSDVLBETADEL */
  const double *pval;
  const double *ptmp;
  int i,j,jb,je,block;
  int ds,bs,br,cs;
  int index;
  int ind;
  int l;                                               /* VECDEL */
  int ii,jj;
  int m=mb*lb;
  int mm=lb*lb;
  double t;

  bindx-=ind_base;

  /* First:                                                 MTSBETADEL */
  /*   Store beta*c in temp array                           MTSBETADEL */
  /*   freeing c for intermediate computations              MTSBETADEL */

  for (l=0;l!=n;l++)                              /* VECDEL MTSBETADEL */
    for (i=0;i!=m;i++) *pwork++ = beta * (*pc++);       /*  MTSBETADEL */

  ptmpwork = pwork;          /* End of the work area       MTSDVLBETADEL */
                             /* needed to store beta*c.    MTSDVLBETADEL */
                             /* (None needed if beta==0)   MTSDVLBETADEL */
                             /* Subsequent work must not   MTSDVLBETADEL */
                             /* overwrite initial portion  MTSDVLBETADEL */
                             /* of work array              MTSDVLBETADEL */
  /* Second:                                                           */
  /*   Calculate  C <- DVR*B                                           */
  for (i=0;i!=mb;i++) {      /* For each block column i                */
    jb = bpntrb[i];             /* beginning block in column           */
    je = bpntre[i];             /* ending block in column              */
    bs = i*lb;                  /* start of b and c                    */
    pb = &b[bs];                /* starting point for b                */
    pc = &c[bs];                /* starting point for c                */

    /* Store in c the block product dvr*b  */

    for (l=0;l!=n;l++){                                      /* VECDEL */
      for (j=0;j!=lb;j++) pc[j] = pb[j];
                    /* Rest of loop not needed if DVR = I    DVRDEL */
      pdr = &dvr[i*mm];                                   /* DVRDEL */
      for (j=0;j!=lb;j++) {                               /* DVRDEL */
        pc[j] *= (*pdr);                                  /* DVRDEL */
        pdr += lb+1;                                      /* DVRDEL */
      }                                                   /* DVRDEL */
      pdr = &dvr[i*mm];                                   /* DVRDEL */
      for (j=0;j!=lb;j++) {                               /* DVRDEL */
        for (ii=0;ii!=lb;ii++)                            /* DVRDEL */
          if ( ii == j ) pdr++;                           /* DVRDEL */
          else  pc[j] += (*pdr++) * pb[ii];               /* DVRDEL */
      }                                                   /* DVRDEL */
      pc+=ldc;pb+=ldb;                                       /* VECDEL */
    }                                                        /* VECDEL */
  }

  /* Now solve for unknown in A*c=c      */

  for (i=0;i!=mb;i++) {      /* For each block column i                */
    jb = bpntrb[i];             /* beginning block in column           */
    je = bpntre[i];             /* ending block in column              */
    cs = i*lb;                  /* start of b and c                    */

    for (j=jb;j!=je;j++) {      /* For each block j in this column     */
      index = bindx[j];             /* block row    index              */
      if ( index == i + ind_base ){ /* on diagonal                     */
        continue;                   /* assume identity block, skip     */
      } else {
        pval = &val[(j-1)*mm];      /* pointer to start of block       */
        ptmp = pval;                                         /* VECDEL */
                                 /* GEMM call inlined here:            */
        bs = (index-1)*lb;          
        pc = &c[bs];                /* start of block of c to modify   */
        cj = &c[cs];                /* start of this block of c        */
        for (l=0;l!=n;l++) {        /* For each column of c     VECDEL */    
          for (ii=0;ii!=lb;ii++) {      /* For each column of A block  */
            t = cj[ii];
            for (jj=0;jj!=lb;jj++) {        /* For row of work block   */
              pc[jj] -= (*pval++) * t;       /* Accumulate dot product */
            }
          }
          pc+=ldc;cj+=ldc;pval=ptmp; /* Skip to next column     VECDEL */
        }                                                    /* VECDEL */
      }
    }
  }

  /* Third:                                                            */
  /* Modify intermediate solution by DVL (block) scaling:              */

  ds = 0;                                                    /* DVLDEL */
  for (i=0;i!=mb;i++) {      /* For each block row i            DVLDEL */
    bs = i*lb;                  /* start of b and c             DVLDEL */
    pdl = &dvl[ds];                                          /* DVLDEL */
    ds += mm;                   /* keep track of place in dvr   DVLDEL */
    pc = &c[bs];                                             /* DVLDEL */
    pwork=ptmpwork;                                          /* DVLDEL */
    for (l=0;l!=n;l++) {        /* For each column of c  VECDEL DVLDEL */    
      for (j=0;j!=lb;j++) {     /* For each row of DVL block    DVLDEL */
        t = 0;                                               /* DVLDEL */
        for (jj=0;jj!=lb;jj++)                               /* DVLDEL */
          t+= pc[jj]*pdl[j+jj*lb];  /* Accumulate dot product   DVLDEL */
        pwork[j] = t;               /* Store in work array      DVLDEL */
      }                                                      /* DVLDEL */
        /* copy work array into c */
      for (j=0;j!=lb;j++) pc[j] = pwork[j];                  /* DVLDEL */ 
      pc+=ldc;                      /* Skip column       VECDEL DVLDEL */
    }                                                 /* VECDEL DVLDEL */
  }                                                          /* DVLDEL */

  /* Finally:                                BLKPOSTPROCESS */
  /* Scale by alpha and          ALPHA1DEL   BLKPOSTPROCESS */
  /*   add BETA * c (from work)  MTSBETADEL  BLKPOSTPROCESS */

  pc=c;                                   /* BLKPOSTPROCESS */
  pwork=work;                             /* BLKPOSTPROCESS    MTSBETADEL */
  for (l=0;l!=n;l++)                      /* BLKPOSTPROCESS        VECDEL */
    for (i=0;i!=m;i++) {                  /* BLKPOSTPROCESS */
          *pc *= alpha;                   /* BLKPOSTPROCESS    ALPHA1DEL  */
          *pc += *pwork;                  /* BLKPOSTPROCESS    MTSBETADEL */
          pc++;                           /* BLKPOSTPROCESS */
          pwork++;                        /* BLKPOSTPROCESS    MTSBETADEL */
    }                                     /* BLKPOSTPROCESS */

}


void BSC_MatTriangSlvUU_CaDADBbC_double(
                 const int mb, const int n,
                 const double *dvl, const double *dvr,
                 const double alpha,
                 const double *val, const int *bindx, 
                 const int *bpntrb, const int *bpntre, const int lb,
                 const double *b, const int ldb, const double beta, 
                 double *c, const int ldc,
                 double *work, const int ind_base)
{
  const double *pb;
  double *pc=c;
  double *cj;
  const double *pdr;
  const double *pdl;
  double *pwork=work;                            /* MTSDVLBETADEL */
  double *ptmpwork;                              /* MTSDVLBETADEL */
  const double *pval;
  const double *ptmp;
  int i,j,jb,je,block;
  int ds,bs,br,cs;
  int index;
  int ind;
  int l;                                               /* VECDEL */
  int ii,jj;
  int m=mb*lb;
  int mm=lb*lb;
  double t;

  bindx-=ind_base;

  /* First:                                                 MTSBETADEL */
  /*   Store beta*c in temp array                           MTSBETADEL */
  /*   freeing c for intermediate computations              MTSBETADEL */

  for (l=0;l!=n;l++)                              /* VECDEL MTSBETADEL */
    for (i=0;i!=m;i++) *pwork++ = beta * (*pc++);       /*  MTSBETADEL */

  ptmpwork = pwork;          /* End of the work area       MTSDVLBETADEL */
                             /* needed to store beta*c.    MTSDVLBETADEL */
                             /* (None needed if beta==0)   MTSDVLBETADEL */
                             /* Subsequent work must not   MTSDVLBETADEL */
                             /* overwrite initial portion  MTSDVLBETADEL */
                             /* of work array              MTSDVLBETADEL */
  /* Second:                                                           */
  /*   Calculate  C <- DVR*B                                           */
  for (i=0;i!=mb;i++) {      /* For each block row i                   */
    jb = bpntrb[i];             /* beginning block in row              */
    je = bpntre[i];             /* ending block in row                 */
    bs = i*lb;                  /* start of b and c                    */
    pb = &b[bs];                /* starting point for b                */
    pc = &c[bs];                /* starting point for c                */

    /* Store in c the block product dvr*b  */

    for (l=0;l!=n;l++){                                      /* VECDEL */
      for (j=0;j!=lb;j++) pc[j] = pb[j];
                    /* Rest of loop not needed if DVR = I    DVRDEL */
      pdr = &dvr[i*mm];                                    /* DVRDEL */
      for (j=0;j!=lb;j++) {                               /* DVRDEL */
        pc[j] *= (*pdr);                                  /* DVRDEL */
        pdr += lb+1;                                      /* DVRDEL */
      }                                                   /* DVRDEL */
      pdr = &dvr[i*mm];                                   /* DVRDEL */
      for (j=0;j!=lb;j++) {                               /* DVRDEL */
        for (ii=0;ii!=lb;ii++)                            /* DVRDEL */
          if ( ii == j ) pdr++;                           /* DVRDEL */
          else  pc[j] += (*pdr++) * pb[ii];               /* DVRDEL */
      }                                                   /* DVRDEL */
      pc+=ldc;pb+=ldb;                                       /* VECDEL */
    }                                                        /* VECDEL */
  }

  /* Now solve for unknown in A*c=c      */

  for (i=mb-1;i!=-1;i--) {      /* For each block column               */
    jb = bpntrb[i];             /* beginning block in column           */
    je = bpntre[i];             /* ending block in column              */
    cs = lb*i;                  /* start of b and c                    */

    for (j=jb;j!=je;j++) {      /* For each block j in this row        */
      index = bindx[j];             /* block column index              */
      if ( index == i + ind_base ){ /* on diagonal                     */
        continue;                   /* assume identity block, skip     */
      } else {
        pval = &val[(j-1)*mm];      /* pointer to start of block       */
        ptmp = pval;                                         /* VECDEL */
                                 /* GEMM call inlined here:            */
        bs = (index-1)*lb;          
        pc = &c[bs];                /* start of block of c to modify   */
        cj = &c[cs];                /* start of this block of c        */
        for (l=0;l!=n;l++) {        /* For each column of c     VECDEL */    
          for (ii=0;ii!=lb;ii++) {      /* For each column of A block  */
            t = cj[ii];
            for (jj=0;jj!=lb;jj++) {        /* For row of work block   */
              pc[jj] -= (*pval++) * t;       /* Accumulate dot product */
            }
          }
          pc+=ldc;cj+=ldc;pval=ptmp; /* Skip to next column     VECDEL */
        }                                                    /* VECDEL */
      }
    }
  }

  /* Third:                                                            */
  /* Modify intermediate solution by DVL (block) scaling:              */

  ds = 0;                                                    /* DVLDEL */
  for (i=0;i!=mb;i++) {      /* For each block row i            DVLDEL */
    bs = i*lb;                  /* start of b and c             DVLDEL */
    pdl = &dvl[ds];                                          /* DVLDEL */
    ds += mm;                   /* keep track of place in dvr   DVLDEL */
    pc = &c[bs];                                             /* DVLDEL */
    pwork=ptmpwork;                                          /* DVLDEL */
    for (l=0;l!=n;l++) {        /* For each column of c  VECDEL DVLDEL */    
      for (j=0;j!=lb;j++) {     /* For each row of DVL block    DVLDEL */
        t = 0;                                               /* DVLDEL */
        for (jj=0;jj!=lb;jj++)                               /* DVLDEL */
          t+= pc[jj]*pdl[j+jj*lb];  /* Accumulate dot product   DVLDEL */
        pwork[j] = t;               /* Store in work array      DVLDEL */
      }                                                      /* DVLDEL */
        /* copy work array into c */
      for (j=0;j!=lb;j++) pc[j] = pwork[j];                  /* DVLDEL */ 
      pc+=ldc;                      /* Skip column       VECDEL DVLDEL */
    }                                                 /* VECDEL DVLDEL */
  }                                                          /* DVLDEL */

  /* Finally:                                BLKPOSTPROCESS */
  /* Scale by alpha                          BLKPOSTPROCESS */
  /*    and add beta * c (from work)         BLKPOSTPROCESS */

  pc=c;                                   /* BLKPOSTPROCESS */
  pwork=work;                             /* BLKPOSTPROCESS    MTSBETADEL */
  for (l=0;l!=n;l++)                      /* BLKPOSTPROCESS        VECDEL */
    for (i=0;i!=m;i++) {                  /* BLKPOSTPROCESS */
          *pc *= alpha;                   /* BLKPOSTPROCESS    ALPHA1DEL  */
          *pc += *pwork;                  /* BLKPOSTPROCESS    MTSBETADEL */
          pc++;                           /* BLKPOSTPROCESS */
          pwork++;                        /* BLKPOSTPROCESS    MTSBETADEL */
    }                                     /* BLKPOSTPROCESS */


}

void BSC_MatTriangSlvLU_CaDATDBbC_double(
                 const int mb, const int n,
                 const double *dvl, const double *dvr,
                 const double alpha,
                 const double *val, const int *bindx, 
                 const int *bpntrb, const int *bpntre, const int lb,
                 const double *b, const int ldb, const double beta, 
                 double *c, const int ldc,
                 double *work, const int ind_base)
{
  const double *pb;
  double *pc=c;
  double *cj;
  const double *pdr;
  const double *pdl;
  double *pwork=work;                             /* MTSDVLBETADEL */
  double *ptmpwork;                               /* MTSDVLBETADEL */
  const double *pval;
  const double *ptmp;
  int i,j,jb,je,block;
  int ds,bs,br,cs;
  int index;
  int ind;
  int l;                                               /* VECDEL */
  int ii,jj;
  int m=mb*lb;
  int mm=lb*lb;
  double t;

  bindx-=ind_base;

  /* First:                                                 MTSBETADEL */
  /*   Store beta*c in temp array                           MTSBETADEL */
  /*   freeing c for intermediate computations              MTSBETADEL */

  for (l=0;l!=n;l++)                              /* VECDEL MTSBETADEL */
    for (i=0;i!=m;i++) *pwork++ = beta * (*pc++);       /*  MTSBETADEL */

  ptmpwork = pwork;        /* End of the work area       MTSDVLBETADEL */
                           /* needed to store beta*c.    MTSDVLBETADEL */
                           /* (None needed if beta==0)   MTSDVLBETADEL */
                           /* Subsequent work must not   MTSDVLBETADEL */
                           /* overwrite initial portion  MTSDVLBETADEL */
                           /* of work array              MTSDVLBETADEL */
  /* Second:                                                           */
  /*   Calculate  C <- DVR*B                                           */
  for (i=0;i!=mb;i++) {      /* For each block row i                   */
    jb = bpntrb[i];             /* beginning block in row              */
    je = bpntre[i];             /* ending block in row                 */
    bs = i*lb;                  /* start of b and c                    */
    pb = &b[bs];                /* starting point for b                */
    pc = &c[bs];                /* starting point for c                */

    /* Store in c the block product dvr*b  */

    for (l=0;l!=n;l++){                                      /* VECDEL */
      for (j=0;j!=lb;j++) pc[j] = pb[j];
                    /* Rest of loop not needed if DVR = I    DVRDEL */
      pdr = &dvr[i*mm];                                   /* DVRDEL */
      for (j=0;j!=lb;j++) {                               /* DVRDEL */
        pc[j] *= (*pdr);                                  /* DVRDEL */
        pdr += lb+1;                                      /* DVRDEL */
      }                                                   /* DVRDEL */
      pdr = &dvr[i*mm];                                   /* DVRDEL */
      for (j=0;j!=lb;j++) {                               /* DVRDEL */
        for (ii=0;ii!=lb;ii++)                            /* DVRDEL */
          if ( ii == j ) pdr++;                           /* DVRDEL */
          else  pc[j] += (*pdr++) * pb[ii];               /* DVRDEL */
      }                                                   /* DVRDEL */
      pc+=ldc;pb+=ldb;                                       /* VECDEL */
    }                                                        /* VECDEL */
  }

  /* Now solve for unknown in A^T*c=c      */

  for (i=mb-1;i!=-1;i--) {      /* For each block row i                */
    jb = bpntrb[i];             /* beginning block in row              */
    je = bpntre[i];             /* ending block in row                 */
    cs = i*lb;                  /* start of b and c                    */

    for (j=je-1;j!=jb-1;j--) {  /* For each block j in this row        */
      index = bindx[j];             /* block column index              */
      if ( index == i + ind_base ){ /* on diagonal                     */
        continue;                   /* assume identity block, skip     */
      } else {
        pval = &val[(j-1)*mm];      /* pointer to start of block       */
        ptmp = pval;                                         /* VECDEL */
                                 /* GEMM call inlined here:            */
        bs = (index-1)*lb;
        pc = &c[cs];                /* start of block of c to modify   */
        cj = &c[bs];                /* start of this block of c        */
        for (l=0;l!=n;l++) {        /* For each column of c     VECDEL */    
          for (ii=0;ii!=lb;ii++) {      /* For each row of A' block    */
            t = 0;
            for (jj=0;jj!=lb;jj++) {        /* For row of work block   */
              t += (*pval++) * cj[jj];      /* Accumulate dot product  */
            }
            pc[ii] -= t;       
          }
          pc+=ldc;cj+=ldc;pval=ptmp; /* Skip to next column     VECDEL */
        }                                                    /* VECDEL */
      }
    }
  }

  /* Third:                                                            */
  /* Modify intermediate solution by DVL (block) scaling:              */

  ds = 0;                                                    /* DVLDEL */
  for (i=0;i!=mb;i++) {      /* For each block row i            DVLDEL */
    bs = i*lb;                  /* start of b and c             DVLDEL */
    pdl = &dvl[ds];                                          /* DVLDEL */
    ds += mm;                   /* keep track of place in dvr   DVLDEL */
    pc = &c[bs];                                             /* DVLDEL */
    pwork=ptmpwork;                                          /* DVLDEL */
    for (l=0;l!=n;l++) {        /* For each column of c  VECDEL DVLDEL */    
      for (j=0;j!=lb;j++) {     /* For each row of DVL block    DVLDEL */
        t = 0;                                               /* DVLDEL */
        for (jj=0;jj!=lb;jj++)                               /* DVLDEL */
          t+= pc[jj]*pdl[j+jj*lb];  /* Accumulate dot product   DVLDEL */
        pwork[j] = t;               /* Store in work array      DVLDEL */
      }                                                      /* DVLDEL */
        /* copy work array into c */
      for (j=0;j!=lb;j++) pc[j] = pwork[j];                  /* DVLDEL */ 
      pc+=ldc;                      /* Skip column       VECDEL DVLDEL */
    }                                                 /* VECDEL DVLDEL */
  }                                                          /* DVLDEL */

  /* Finally:                                BLKPOSTPROCESS */
  /* Scale by alpha                          BLKPOSTPROCESS */
  /*    and add beta * c (from work)         BLKPOSTPROCESS */

  pc=c;                                   /* BLKPOSTPROCESS */
  pwork=work;                             /* BLKPOSTPROCESS    MTSBETADEL */
  for (l=0;l!=n;l++)                      /* BLKPOSTPROCESS        VECDEL */
    for (i=0;i!=m;i++) {                  /* BLKPOSTPROCESS */
          *pc *= alpha;                   /* BLKPOSTPROCESS    ALPHA1DEL  */
          *pc += *pwork;                  /* BLKPOSTPROCESS    MTSBETADEL */
          pc++;                           /* BLKPOSTPROCESS */
          pwork++;                        /* BLKPOSTPROCESS    MTSBETADEL */
    }                                     /* BLKPOSTPROCESS */


}

void BSC_MatTriangSlvUU_CaDATDBbC_double(
                 const int mb, const int n, 
                 const double *dvl, const double *dvr,
                 const double alpha,
                 const double *val, const int *bindx, 
                 const int *bpntrb, const int *bpntre, const int lb,
                 const double *b, const int ldb, const double beta, 
                 double *c, const int ldc,
                 double *work, const int ind_base)
{
  const double *pb;
  double *pc=c;
  double *cj;
  const double *pdr;
  const double *pdl;
  double *pwork=work;                             /* MTSDVLBETADEL */
  double *ptmpwork;                               /* MTSDVLBETADEL */
  const double *pval;
  const double *ptmp;
  int i,j,jb,je,block;
  int ds,bs,br,cs;
  int index;
  int ind;
  int l;                                               /* VECDEL */
  int ii,jj;
  int m=mb*lb;
  int mm=lb*lb;
  double t;

  bindx-=ind_base;

  /* First:                                                 MTSBETADEL */
  /*   Store beta*c in temp array                           MTSBETADEL */
  /*   freeing c for intermediate computations              MTSBETADEL */

  for (l=0;l!=n;l++)                              /* VECDEL MTSBETADEL */
    for (i=0;i!=m;i++) *pwork++ = beta * (*pc++);       /*  MTSBETADEL */

  ptmpwork = pwork;          /* End of the work area       MTSDVLBETADEL */
                             /* needed to store beta*c.    MTSDVLBETADEL */
                             /* (None needed if beta==0)   MTSDVLBETADEL */
                             /* Subsequent work must not   MTSDVLBETADEL */
                             /* overwrite initial portion  MTSDVLBETADEL */
                             /* of work array              MTSDVLBETADEL */
  /* Second:                                                           */
  /*   Calculate  C <- DVR*B                                           */
  for (i=0;i!=mb;i++) {      /* For each block row i                   */
    jb = bpntrb[i];             /* beginning block in row              */
    je = bpntre[i];             /* ending block in row                 */
    bs = i*lb;                  /* start of b and c                    */
    pb = &b[bs];                /* starting point for b                */
    pc = &c[bs];                /* starting point for c                */

    /* Store in c the block product dvr*b  */

    for (l=0;l!=n;l++){                                      /* VECDEL */
      for (j=0;j!=lb;j++) pc[j] = pb[j];
                    /* Rest of loop not needed if DVR = I    DVRDEL */
      pdr = &dvr[i*mm];                                   /* DVRDEL */
      for (j=0;j!=lb;j++) {                               /* DVRDEL */
        pc[j] *= (*pdr);                                  /* DVRDEL */
        pdr += lb+1;                                      /* DVRDEL */
      }                                                   /* DVRDEL */
      pdr = &dvr[i*mm];                                   /* DVRDEL */
      for (j=0;j!=lb;j++) {                               /* DVRDEL */
        for (ii=0;ii!=lb;ii++)                            /* DVRDEL */
          if ( ii == j ) pdr++;                           /* DVRDEL */
          else  pc[j] += (*pdr++) * pb[ii];               /* DVRDEL */
      }                                                   /* DVRDEL */
      pc+=ldc;pb+=ldb;                                       /* VECDEL */
    }                                                        /* VECDEL */
  }

  /* Now solve for unknown in A^T*c=c      */

  for (i=0;i!=mb;i++) {      /* For each block row i                   */
    jb = bpntrb[i];             /* beginning block in row              */
    je = bpntre[i];             /* ending block in row                 */
    cs = i*lb;                  /* start of b and c                    */

    for (j=jb;j!=je;j++) {      /* For each block j in this row        */
      index = bindx[j];             /* block column index              */
      if ( index == i + ind_base ){ /* on diagonal                     */
        continue;                   /* assume identity block, skip     */
      } else {
        pval = &val[(j-1)*mm];      /* pointer to start of block       */
        ptmp = pval;                                         /* VECDEL */
                                 /* GEMM call inlined here:            */
        bs = (index-1)*lb;          
        pc = &c[cs];                /* start of block of c to modify   */
        cj = &c[bs];                /* start of this block of c        */
        for (l=0;l!=n;l++) {        /* For each column of c     VECDEL */    
          for (ii=0;ii!=lb;ii++) {      /* For each column of A block  */
            t = 0;
            for (jj=0;jj!=lb;jj++) {        /* For row of work block   */
              t += (*pval++) * cj[jj];      /* Accumulate dot product  */
            }
            pc[ii] -= t;
          }
          pc+=ldc;cj+=ldc;pval=ptmp; /* Skip to next column     VECDEL */
        }                                                    /* VECDEL */
      }
    }
  }

  /* Third:                                                            */
  /* Modify intermediate solution by DVL (block) scaling:              */

  ds = 0;                                                    /* DVLDEL */
  for (i=0;i!=mb;i++) {      /* For each block row i            DVLDEL */
    bs = i*lb;                  /* start of b and c             DVLDEL */
    pdl = &dvl[ds];                                          /* DVLDEL */
    ds += mm;                /* keep track of place in dvr   DVLDEL */
    pc = &c[bs];                                             /* DVLDEL */
    pwork=ptmpwork;                                          /* DVLDEL */
    for (l=0;l!=n;l++) {        /* For each column of c  VECDEL DVLDEL */    
      for (j=0;j!=lb;j++) {     /* For each row of DVL block    DVLDEL */
        t = 0;                                               /* DVLDEL */
        for (jj=0;jj!=lb;jj++)                               /* DVLDEL */
          t+= pc[jj]*pdl[j+jj*lb];  /* Accumulate dot product   DVLDEL */
        pwork[j] = t;               /* Store in work array      DVLDEL */
      }                                                      /* DVLDEL */
        /* copy work array into c */
      for (j=0;j!=lb;j++) pc[j] = pwork[j];                  /* DVLDEL */ 
      pc+=ldc;                      /* Skip column       VECDEL DVLDEL */
    }                                                 /* VECDEL DVLDEL */
  }                                                          /* DVLDEL */

  /* Finally:                                BLKPOSTPROCESS */
  /* Scale by alpha                          BLKPOSTPROCESS */
  /*    and add beta * c (from work)         BLKPOSTPROCESS */

  pc=c;                                   /* BLKPOSTPROCESS */
  pwork=work;                             /* BLKPOSTPROCESS    MTSBETADEL */
  for (l=0;l!=n;l++)                      /* BLKPOSTPROCESS        VECDEL */
    for (i=0;i!=m;i++) {                  /* BLKPOSTPROCESS */
          *pc *= alpha;                   /* BLKPOSTPROCESS    ALPHA1DEL  */
          *pc += *pwork;                  /* BLKPOSTPROCESS    MTSBETADEL */
          pc++;                           /* BLKPOSTPROCESS */
          pwork++;                        /* BLKPOSTPROCESS    MTSBETADEL */
    }                                     /* BLKPOSTPROCESS */


}
