Actual source code: vpscat.c
  2: /*
  3:     Defines parallel vector scatters.
  4: */
  6: #include <private/isimpl.h>
  7: #include <private/vecimpl.h>         /*I "petscvec.h" I*/
  8: #include <../src/vec/vec/impls/dvecimpl.h>
  9: #include <../src/vec/vec/impls/mpi/pvecimpl.h>
 13: PetscErrorCode VecScatterView_MPI(VecScatter ctx,PetscViewer viewer)
 14: {
 15:   VecScatter_MPI_General *to=(VecScatter_MPI_General*)ctx->todata;
 16:   VecScatter_MPI_General *from=(VecScatter_MPI_General*)ctx->fromdata;
 17:   PetscErrorCode         ierr;
 18:   PetscInt               i;
 19:   PetscMPIInt            rank;
 20:   PetscViewerFormat      format;
 21:   PetscBool              iascii;
 24:   PetscTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);
 25:   if (iascii) {
 26:     MPI_Comm_rank(((PetscObject)ctx)->comm,&rank);
 27:     PetscViewerGetFormat(viewer,&format);
 28:     if (format ==  PETSC_VIEWER_ASCII_INFO) {
 29:       PetscInt nsend_max,nrecv_max,lensend_max,lenrecv_max,alldata,itmp;
 31:       MPI_Reduce(&to->n,&nsend_max,1,MPIU_INT,MPI_MAX,0,((PetscObject)ctx)->comm);
 32:       MPI_Reduce(&from->n,&nrecv_max,1,MPIU_INT,MPI_MAX,0,((PetscObject)ctx)->comm);
 33:       itmp = to->starts[to->n+1];
 34:       MPI_Reduce(&itmp,&lensend_max,1,MPIU_INT,MPI_MAX,0,((PetscObject)ctx)->comm);
 35:       itmp = from->starts[from->n+1];
 36:       MPI_Reduce(&itmp,&lenrecv_max,1,MPIU_INT,MPI_MAX,0,((PetscObject)ctx)->comm);
 37:       MPI_Reduce(&itmp,&alldata,1,MPIU_INT,MPI_SUM,0,((PetscObject)ctx)->comm);
 39:       PetscViewerASCIIPrintf(viewer,"VecScatter statistics\n");
 40:       PetscViewerASCIIPrintf(viewer,"  Maximum number sends %D\n",nsend_max);
 41:       PetscViewerASCIIPrintf(viewer,"  Maximum number receives %D\n",nrecv_max);
 42:       PetscViewerASCIIPrintf(viewer,"  Maximum data sent %D\n",(int)(lensend_max*to->bs*sizeof(PetscScalar)));
 43:       PetscViewerASCIIPrintf(viewer,"  Maximum data received %D\n",(int)(lenrecv_max*to->bs*sizeof(PetscScalar)));
 44:       PetscViewerASCIIPrintf(viewer,"  Total data sent %D\n",(int)(alldata*to->bs*sizeof(PetscScalar)));
 46:     } else {
 47:       PetscViewerASCIISynchronizedAllow(viewer,PETSC_TRUE);
 48:       PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number sends = %D; Number to self = %D\n",rank,to->n,to->local.n);
 49:       if (to->n) {
 50:         for (i=0; i<to->n; i++){
 51:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d]   %D length = %D to whom %D\n",rank,i,to->starts[i+1]-to->starts[i],to->procs[i]);
 52:         }
 53:         PetscViewerASCIISynchronizedPrintf(viewer,"Now the indices for all remote sends (in order by process sent to)\n");
 54:         for (i=0; i<to->starts[to->n]; i++){
 55:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D \n",rank,to->indices[i]);
 56:         }
 57:       }
 59:       PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number receives = %D; Number from self = %D\n",rank,from->n,from->local.n);
 60:       if (from->n) {
 61:         for (i=0; i<from->n; i++){
 62:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D length %D from whom %D\n",rank,i,from->starts[i+1]-from->starts[i],from->procs[i]);
 63:         }
 65:         PetscViewerASCIISynchronizedPrintf(viewer,"Now the indices for all remote receives (in order by process received from)\n");
 66:         for (i=0; i<from->starts[from->n]; i++){
 67:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D \n",rank,from->indices[i]);
 68:         }
 69:       }
 70:       if (to->local.n) {
 71:         PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Indices for local part of scatter\n",rank);
 72:         for (i=0; i<to->local.n; i++){
 73:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d] From %D to %D \n",rank,from->local.vslots[i],to->local.vslots[i]);
 74:         }
 75:       }
 77:       PetscViewerFlush(viewer);
 78:       PetscViewerASCIISynchronizedAllow(viewer,PETSC_FALSE);
 79:     }
 80:   } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Viewer type %s not supported for this scatter",((PetscObject)viewer)->type_name);
 81:   return(0);
 82: }
 84: /* -----------------------------------------------------------------------------------*/
 85: /*
 86:       The next routine determines what part of  the local part of the scatter is an
 87:   exact copy of values into their current location. We check this here and
 88:   then know that we need not perform that portion of the scatter when the vector is
 89:   scattering to itself with INSERT_VALUES.
 91:      This is currently not used but would speed up, for example DMDALocalToLocalBegin/End()
 93: */
 96: PetscErrorCode VecScatterLocalOptimize_Private(VecScatter scatter,VecScatter_Seq_General *to,VecScatter_Seq_General *from)
 97: {
 98:   PetscInt       n = to->n,n_nonmatching = 0,i,*to_slots = to->vslots,*from_slots = from->vslots;
100:   PetscInt       *nto_slots,*nfrom_slots,j = 0;
101: 
103:   for (i=0; i<n; i++) {
104:     if (to_slots[i] != from_slots[i]) n_nonmatching++;
105:   }
107:   if (!n_nonmatching) {
108:     to->nonmatching_computed = PETSC_TRUE;
109:     to->n_nonmatching        = from->n_nonmatching = 0;
110:     PetscInfo1(scatter,"Reduced %D to 0\n", n);
111:   } else if (n_nonmatching == n) {
112:     to->nonmatching_computed = PETSC_FALSE;
113:     PetscInfo(scatter,"All values non-matching\n");
114:   } else {
115:     to->nonmatching_computed= PETSC_TRUE;
116:     to->n_nonmatching       = from->n_nonmatching = n_nonmatching;
117:     PetscMalloc(n_nonmatching*sizeof(PetscInt),&nto_slots);
118:     PetscMalloc(n_nonmatching*sizeof(PetscInt),&nfrom_slots);
119:     to->slots_nonmatching   = nto_slots;
120:     from->slots_nonmatching = nfrom_slots;
121:     for (i=0; i<n; i++) {
122:       if (to_slots[i] != from_slots[i]) {
123:         nto_slots[j]   = to_slots[i];
124:         nfrom_slots[j] = from_slots[i];
125:         j++;
126:       }
127:     }
128:     PetscInfo2(scatter,"Reduced %D to %D\n",n,n_nonmatching);
129:   }
130:   return(0);
131: }
133: /* --------------------------------------------------------------------------------------*/
135: /* -------------------------------------------------------------------------------------*/
138: PetscErrorCode VecScatterDestroy_PtoP(VecScatter ctx)
139: {
140:   VecScatter_MPI_General *to   = (VecScatter_MPI_General*)ctx->todata;
141:   VecScatter_MPI_General *from = (VecScatter_MPI_General*)ctx->fromdata;
142:   PetscErrorCode         ierr;
143:   PetscInt               i;
146:   if (to->use_readyreceiver) {
147:     /*
148:        Since we have already posted sends we must cancel them before freeing 
149:        the requests
150:     */
151:     for (i=0; i<from->n; i++) {
152:       MPI_Cancel(from->requests+i);
153:     }
154:     for (i=0; i<to->n; i++) {
155:       MPI_Cancel(to->rev_requests+i);
156:     }
157:     MPI_Waitall(from->n,from->requests,to->rstatus);
158:     MPI_Waitall(to->n,to->rev_requests,to->rstatus);
159:   }
161: #if defined(PETSC_HAVE_MPI_ALLTOALLW) && !defined(PETSC_USE_64BIT_INDICES)
162:   if (to->use_alltoallw) {
163:     PetscFree3(to->wcounts,to->wdispls,to->types);
164:     PetscFree3(from->wcounts,from->wdispls,from->types);
165:   }
166: #endif
168: #if defined(PETSC_HAVE_MPI_WIN_CREATE)
169:   if (to->use_window) {
170:     MPI_Win_free(&from->window);
171:     MPI_Win_free(&to->window);
172:     PetscFree(from->winstarts);
173:     PetscFree(to->winstarts);
174:   }
175: #endif
177:   if (to->use_alltoallv) {
178:     PetscFree2(to->counts,to->displs);
179:     PetscFree2(from->counts,from->displs);
180:   }
182:   /* release MPI resources obtained with MPI_Send_init() and MPI_Recv_init() */
183:   /* 
184:      IBM's PE version of MPI has a bug where freeing these guys will screw up later
185:      message passing.
186:   */
187: #if !defined(PETSC_HAVE_BROKEN_REQUEST_FREE)
188:   if (!to->use_alltoallv && !to->use_window) {   /* currently the to->requests etc are ALWAYS allocated even if not used */
189:     if (to->requests) {
190:       for (i=0; i<to->n; i++) {
191:         MPI_Request_free(to->requests + i);
192:       }
193:     }
194:     if (to->rev_requests) {
195:       for (i=0; i<to->n; i++) {
196:         MPI_Request_free(to->rev_requests + i);
197:       }
198:     }
199:   }
200:   /*
201:       MPICH could not properly cancel requests thus with ready receiver mode we
202:     cannot free the requests. It may be fixed now, if not then put the following 
203:     code inside a if !to->use_readyreceiver) {
204:   */
205:   if (!to->use_alltoallv && !to->use_window) {    /* currently the from->requests etc are ALWAYS allocated even if not used */
206:     if (from->requests) {
207:       for (i=0; i<from->n; i++) {
208:         MPI_Request_free(from->requests + i);
209:       }
210:     }
212:     if (from->rev_requests) {
213:       for (i=0; i<from->n; i++) {
214:         MPI_Request_free(from->rev_requests + i);
215:       }
216:     }
217:   }
218: #endif
220:   PetscFree(to->local.vslots);
221:   PetscFree(from->local.vslots);
222:   PetscFree2(to->counts,to->displs);
223:   PetscFree2(from->counts,from->displs);
224:   PetscFree(to->local.slots_nonmatching);
225:   PetscFree(from->local.slots_nonmatching);
226:   PetscFree(to->rev_requests);
227:   PetscFree(from->rev_requests);
228:   PetscFree(to->requests);
229:   PetscFree(from->requests);
230:   PetscFree4(to->values,to->indices,to->starts,to->procs);
231:   PetscFree2(to->sstatus,to->rstatus);
232:   PetscFree4(from->values,from->indices,from->starts,from->procs);
233:   PetscFree(from);
234:   PetscFree(to);
235: #if defined(PETSC_HAVE_CUSP)
236:   PetscCUSPIndicesDestroy((PetscCUSPIndices*)&ctx->spptr);
237: #endif
238:   return(0);
239: }
243: /* --------------------------------------------------------------------------------------*/
244: /*
245:     Special optimization to see if the local part of the scatter is actually 
246:     a copy. The scatter routines call PetscMemcpy() instead.
247:  
248: */
251: PetscErrorCode VecScatterLocalOptimizeCopy_Private(VecScatter scatter,VecScatter_Seq_General *to,VecScatter_Seq_General *from,PetscInt bs)
252: {
253:   PetscInt       n = to->n,i,*to_slots = to->vslots,*from_slots = from->vslots;
254:   PetscInt       to_start,from_start;
258:   to_start   = to_slots[0];
259:   from_start = from_slots[0];
261:   for (i=1; i<n; i++) {
262:     to_start   += bs;
263:     from_start += bs;
264:     if (to_slots[i]   != to_start)   return(0);
265:     if (from_slots[i] != from_start) return(0);
266:   }
267:   to->is_copy       = PETSC_TRUE;
268:   to->copy_start    = to_slots[0];
269:   to->copy_length   = bs*sizeof(PetscScalar)*n;
270:   from->is_copy     = PETSC_TRUE;
271:   from->copy_start  = from_slots[0];
272:   from->copy_length = bs*sizeof(PetscScalar)*n;
273:   PetscInfo(scatter,"Local scatter is a copy, optimizing for it\n");
274:   return(0);
275: }
277: /* --------------------------------------------------------------------------------------*/
281: PetscErrorCode VecScatterCopy_PtoP_X(VecScatter in,VecScatter out)
282: {
283:   VecScatter_MPI_General *in_to   = (VecScatter_MPI_General*)in->todata;
284:   VecScatter_MPI_General *in_from = (VecScatter_MPI_General*)in->fromdata,*out_to,*out_from;
285:   PetscErrorCode         ierr;
286:   PetscInt               ny,bs = in_from->bs;
289:   out->begin     = in->begin;
290:   out->end       = in->end;
291:   out->copy      = in->copy;
292:   out->destroy   = in->destroy;
293:   out->view      = in->view;
295:   /* allocate entire send scatter context */
296:   PetscNewLog(out,VecScatter_MPI_General,&out_to);
297:   PetscNewLog(out,VecScatter_MPI_General,&out_from);
299:   ny                = in_to->starts[in_to->n];
300:   out_to->n         = in_to->n;
301:   out_to->type      = in_to->type;
302:   out_to->sendfirst = in_to->sendfirst;
304:   PetscMalloc(out_to->n*sizeof(MPI_Request),&out_to->requests);
305:   PetscMalloc4(bs*ny,PetscScalar,&out_to->values,ny,PetscInt,&out_to->indices,out_to->n+1,PetscInt,&out_to->starts,out_to->n,PetscMPIInt,&out_to->procs);
306:   PetscMalloc2(PetscMax(in_to->n,in_from->n),MPI_Status,&out_to->sstatus,PetscMax(in_to->n,in_from->n),MPI_Status,&out_to->rstatus);
307:   PetscMemcpy(out_to->indices,in_to->indices,ny*sizeof(PetscInt));
308:   PetscMemcpy(out_to->starts,in_to->starts,(out_to->n+1)*sizeof(PetscInt));
309:   PetscMemcpy(out_to->procs,in_to->procs,(out_to->n)*sizeof(PetscMPIInt));
310: 
311:   out->todata       = (void*)out_to;
312:   out_to->local.n   = in_to->local.n;
313:   out_to->local.nonmatching_computed = PETSC_FALSE;
314:   out_to->local.n_nonmatching        = 0;
315:   out_to->local.slots_nonmatching    = 0;
316:   if (in_to->local.n) {
317:     PetscMalloc(in_to->local.n*sizeof(PetscInt),&out_to->local.vslots);
318:     PetscMalloc(in_from->local.n*sizeof(PetscInt),&out_from->local.vslots);
319:     PetscMemcpy(out_to->local.vslots,in_to->local.vslots,in_to->local.n*sizeof(PetscInt));
320:     PetscMemcpy(out_from->local.vslots,in_from->local.vslots,in_from->local.n*sizeof(PetscInt));
321:   } else {
322:     out_to->local.vslots   = 0;
323:     out_from->local.vslots = 0;
324:   }
326:   /* allocate entire receive context */
327:   out_from->type      = in_from->type;
328:   ny                  = in_from->starts[in_from->n];
329:   out_from->n         = in_from->n;
330:   out_from->sendfirst = in_from->sendfirst;
332:   PetscMalloc(out_from->n*sizeof(MPI_Request),&out_from->requests);
333:   PetscMalloc4(ny*bs,PetscScalar,&out_from->values,ny,PetscInt,&out_from->indices,out_from->n+1,PetscInt,&out_from->starts,out_from->n,PetscMPIInt,&out_from->procs);
334:   PetscMemcpy(out_from->indices,in_from->indices,ny*sizeof(PetscInt));
335:   PetscMemcpy(out_from->starts,in_from->starts,(out_from->n+1)*sizeof(PetscInt));
336:   PetscMemcpy(out_from->procs,in_from->procs,(out_from->n)*sizeof(PetscMPIInt));
337:   out->fromdata       = (void*)out_from;
338:   out_from->local.n   = in_from->local.n;
339:   out_from->local.nonmatching_computed = PETSC_FALSE;
340:   out_from->local.n_nonmatching        = 0;
341:   out_from->local.slots_nonmatching    = 0;
343:   /* 
344:       set up the request arrays for use with isend_init() and irecv_init()
345:   */
346:   {
347:     PetscMPIInt tag;
348:     MPI_Comm    comm;
349:     PetscInt    *sstarts = out_to->starts,  *rstarts = out_from->starts;
350:     PetscMPIInt *sprocs  = out_to->procs,   *rprocs  = out_from->procs;
351:     PetscInt    i;
352:     PetscBool   flg;
353:     MPI_Request *swaits  = out_to->requests,*rwaits  = out_from->requests;
354:     MPI_Request *rev_swaits,*rev_rwaits;
355:     PetscScalar *Ssvalues = out_to->values, *Srvalues = out_from->values;
357:     PetscMalloc(in_to->n*sizeof(MPI_Request),&out_to->rev_requests);
358:     PetscMalloc(in_from->n*sizeof(MPI_Request),&out_from->rev_requests);
360:     rev_rwaits = out_to->rev_requests;
361:     rev_swaits = out_from->rev_requests;
363:     out_from->bs = out_to->bs = bs;
364:     tag     = ((PetscObject)out)->tag;
365:     comm    = ((PetscObject)out)->comm;
367:     /* Register the receives that you will use later (sends for scatter reverse) */
368:     for (i=0; i<out_from->n; i++) {
369:       MPI_Recv_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
370:       MPI_Send_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rev_swaits+i);
371:     }
373:     flg  = PETSC_FALSE;
374:     PetscOptionsGetBool(PETSC_NULL,"-vecscatter_rsend",&flg,PETSC_NULL);
375:     if (flg) {
376:       out_to->use_readyreceiver    = PETSC_TRUE;
377:       out_from->use_readyreceiver  = PETSC_TRUE;
378:       for (i=0; i<out_to->n; i++) {
379:         MPI_Rsend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
380:       }
381:       if (out_from->n) {MPI_Startall_irecv(out_from->starts[out_from->n]*out_from->bs,out_from->n,out_from->requests);}
382:       MPI_Barrier(comm);
383:       PetscInfo(in,"Using VecScatter ready receiver mode\n");
384:     } else {
385:       out_to->use_readyreceiver    = PETSC_FALSE;
386:       out_from->use_readyreceiver  = PETSC_FALSE;
387:       flg                          = PETSC_FALSE;
388:       PetscOptionsGetBool(PETSC_NULL,"-vecscatter_ssend",&flg,PETSC_NULL);
389:       if (flg) {
390:         PetscInfo(in,"Using VecScatter Ssend mode\n");
391:       }
392:       for (i=0; i<out_to->n; i++) {
393:         if (!flg) {
394:           MPI_Send_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
395:         } else {
396:           MPI_Ssend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
397:         }
398:       }
399:     }
400:     /* Register receives for scatter reverse */
401:     for (i=0; i<out_to->n; i++) {
402:       MPI_Recv_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,rev_rwaits+i);
403:     }
404:   }
406:   return(0);
407: }
411: PetscErrorCode VecScatterCopy_PtoP_AllToAll(VecScatter in,VecScatter out)
412: {
413:   VecScatter_MPI_General *in_to   = (VecScatter_MPI_General*)in->todata;
414:   VecScatter_MPI_General *in_from = (VecScatter_MPI_General*)in->fromdata,*out_to,*out_from;
415:   PetscErrorCode         ierr;
416:   PetscInt               ny,bs = in_from->bs;
417:   PetscMPIInt            size;
420:   MPI_Comm_size(((PetscObject)in)->comm,&size);
421:   out->begin     = in->begin;
422:   out->end       = in->end;
423:   out->copy      = in->copy;
424:   out->destroy   = in->destroy;
425:   out->view      = in->view;
427:   /* allocate entire send scatter context */
428:   PetscNewLog(out,VecScatter_MPI_General,&out_to);
429:   PetscNewLog(out,VecScatter_MPI_General,&out_from);
431:   ny                = in_to->starts[in_to->n];
432:   out_to->n         = in_to->n;
433:   out_to->type      = in_to->type;
434:   out_to->sendfirst = in_to->sendfirst;
436:   PetscMalloc(out_to->n*sizeof(MPI_Request),&out_to->requests);
437:   PetscMalloc4(bs*ny,PetscScalar,&out_to->values,ny,PetscInt,&out_to->indices,out_to->n+1,PetscInt,&out_to->starts,out_to->n,PetscMPIInt,&out_to->procs);
438:   PetscMalloc2(PetscMax(in_to->n,in_from->n),MPI_Status,&out_to->sstatus,PetscMax(in_to->n,in_from->n),MPI_Status,&out_to->rstatus);
439:   PetscMemcpy(out_to->indices,in_to->indices,ny*sizeof(PetscInt));
440:   PetscMemcpy(out_to->starts,in_to->starts,(out_to->n+1)*sizeof(PetscInt));
441:   PetscMemcpy(out_to->procs,in_to->procs,(out_to->n)*sizeof(PetscMPIInt));
442: 
443:   out->todata       = (void*)out_to;
444:   out_to->local.n   = in_to->local.n;
445:   out_to->local.nonmatching_computed = PETSC_FALSE;
446:   out_to->local.n_nonmatching        = 0;
447:   out_to->local.slots_nonmatching    = 0;
448:   if (in_to->local.n) {
449:     PetscMalloc(in_to->local.n*sizeof(PetscInt),&out_to->local.vslots);
450:     PetscMalloc(in_from->local.n*sizeof(PetscInt),&out_from->local.vslots);
451:     PetscMemcpy(out_to->local.vslots,in_to->local.vslots,in_to->local.n*sizeof(PetscInt));
452:     PetscMemcpy(out_from->local.vslots,in_from->local.vslots,in_from->local.n*sizeof(PetscInt));
453:   } else {
454:     out_to->local.vslots   = 0;
455:     out_from->local.vslots = 0;
456:   }
458:   /* allocate entire receive context */
459:   out_from->type      = in_from->type;
460:   ny                  = in_from->starts[in_from->n];
461:   out_from->n         = in_from->n;
462:   out_from->sendfirst = in_from->sendfirst;
464:   PetscMalloc(out_from->n*sizeof(MPI_Request),&out_from->requests);
465:   PetscMalloc4(ny*bs,PetscScalar,&out_from->values,ny,PetscInt,&out_from->indices,out_from->n+1,PetscInt,&out_from->starts,out_from->n,PetscMPIInt,&out_from->procs);
466:   PetscMemcpy(out_from->indices,in_from->indices,ny*sizeof(PetscInt));
467:   PetscMemcpy(out_from->starts,in_from->starts,(out_from->n+1)*sizeof(PetscInt));
468:   PetscMemcpy(out_from->procs,in_from->procs,(out_from->n)*sizeof(PetscMPIInt));
469:   out->fromdata       = (void*)out_from;
470:   out_from->local.n   = in_from->local.n;
471:   out_from->local.nonmatching_computed = PETSC_FALSE;
472:   out_from->local.n_nonmatching        = 0;
473:   out_from->local.slots_nonmatching    = 0;
475:   out_to->use_alltoallv = out_from->use_alltoallv = PETSC_TRUE;
477:   PetscMalloc2(size,PetscMPIInt,&out_to->counts,size,PetscMPIInt,&out_to->displs);
478:   PetscMemcpy(out_to->counts,in_to->counts,size*sizeof(PetscMPIInt));
479:   PetscMemcpy(out_to->displs,in_to->displs,size*sizeof(PetscMPIInt));
481:   PetscMalloc2(size,PetscMPIInt,&out_from->counts,size,PetscMPIInt,&out_from->displs);
482:   PetscMemcpy(out_from->counts,in_from->counts,size*sizeof(PetscMPIInt));
483:   PetscMemcpy(out_from->displs,in_from->displs,size*sizeof(PetscMPIInt));
484:   return(0);
485: }
486: /* --------------------------------------------------------------------------------------------------
487:     Packs and unpacks the message data into send or from receive buffers. 
489:     These could be generated automatically. 
491:     Fortran kernels etc. could be used.
492: */
493: PETSC_STATIC_INLINE void Pack_1(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
494: {
495:   PetscInt i;
496:   for (i=0; i<n; i++) {
497:     y[i]  = x[indicesx[i]];
498:   }
499: }
500: PETSC_STATIC_INLINE void UnPack_1(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
501: {
502:   PetscInt i;
503:   switch (addv) {
504:   case INSERT_VALUES:
505:   case INSERT_ALL_VALUES:
506:     for (i=0; i<n; i++) {
507:       y[indicesy[i]] = x[i];
508:     }
509:     break;
510:   case ADD_VALUES:
511:   case ADD_ALL_VALUES:
512:     for (i=0; i<n; i++) {
513:       y[indicesy[i]] += x[i];
514:     }
515:     break;
516: #if !defined(PETSC_USE_COMPLEX)
517:   case MAX_VALUES:
518:     for (i=0; i<n; i++) {
519:       y[indicesy[i]] = PetscMax(y[indicesy[i]],x[i]);
520:     }
521: #else
522:   case MAX_VALUES:
523: #endif
524:   case NOT_SET_VALUES:
525:     break;
526:   }
527: }
529: PETSC_STATIC_INLINE void Scatter_1(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
530: {
531:   PetscInt i;
532:   switch (addv) {
533:   case INSERT_VALUES:
534:   case INSERT_ALL_VALUES:
535:     for (i=0; i<n; i++) {
536:       y[indicesy[i]] = x[indicesx[i]];
537:     }
538:     break;
539:   case ADD_VALUES:
540:   case ADD_ALL_VALUES:
541:     for (i=0; i<n; i++) {
542:       y[indicesy[i]] += x[indicesx[i]];
543:     }
544:     break;
545: #if !defined(PETSC_USE_COMPLEX)
546:   case MAX_VALUES:
547:     for (i=0; i<n; i++) {
548:       y[indicesy[i]] = PetscMax(y[indicesy[i]],x[indicesx[i]]);
549:     }
550: #else
551:   case MAX_VALUES:
552: #endif
553:   case NOT_SET_VALUES:
554:     break;
555:   }
556: }
558:   /* ----------------------------------------------------------------------------------------------- */
559: PETSC_STATIC_INLINE void Pack_2(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
560: {
561:   PetscInt i,idx;
563:   for (i=0; i<n; i++) {
564:     idx   = *indicesx++;
565:     y[0]  = x[idx];
566:     y[1]  = x[idx+1];
567:     y    += 2;
568:   }
569: }
570: PETSC_STATIC_INLINE void UnPack_2(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
571: {
572:   PetscInt i,idy;
574:   switch (addv) {
575:   case INSERT_VALUES:
576:   case INSERT_ALL_VALUES:
577:     for (i=0; i<n; i++) {
578:       idy       = *indicesy++;
579:       y[idy]    = x[0];
580:       y[idy+1]  = x[1];
581:       x    += 2;
582:     }
583:     break;
584:   case ADD_VALUES:
585:   case ADD_ALL_VALUES:
586:     for (i=0; i<n; i++) {
587:       idy       = *indicesy++;
588:       y[idy]    += x[0];
589:       y[idy+1]  += x[1];
590:       x    += 2;
591:     }
592:     break;
593: #if !defined(PETSC_USE_COMPLEX)
594:   case MAX_VALUES:
595:     for (i=0; i<n; i++) {
596:       idy       = *indicesy++;
597:       y[idy]    = PetscMax(y[idy],x[0]);
598:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
599:       x    += 2;
600:     }
601: #else
602:   case MAX_VALUES:
603: #endif
604:   case NOT_SET_VALUES:
605:     break;
606:   }
607: }
609: PETSC_STATIC_INLINE void Scatter_2(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
610: {
611:   PetscInt i,idx,idy;
613:   switch (addv) {
614:   case INSERT_VALUES:
615:   case INSERT_ALL_VALUES:
616:     for (i=0; i<n; i++) {
617:       idx       = *indicesx++;
618:       idy       = *indicesy++;
619:       y[idy]    = x[idx];
620:       y[idy+1]  = x[idx+1];
621:     }
622:     break;
623:   case ADD_VALUES:
624:   case ADD_ALL_VALUES:
625:     for (i=0; i<n; i++) {
626:       idx       = *indicesx++;
627:       idy       = *indicesy++;
628:       y[idy]    += x[idx];
629:       y[idy+1]  += x[idx+1];
630:     }
631:     break;
632: #if !defined(PETSC_USE_COMPLEX)
633:   case MAX_VALUES:
634:     for (i=0; i<n; i++) {
635:       idx       = *indicesx++;
636:       idy       = *indicesy++;
637:       y[idy]    = PetscMax(y[idy],x[idx]);
638:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
639:     }
640: #else
641:   case MAX_VALUES:
642: #endif
643:   case NOT_SET_VALUES:
644:     break;
645:   }
646: }
647:   /* ----------------------------------------------------------------------------------------------- */
648: PETSC_STATIC_INLINE void Pack_3(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
649: {
650:   PetscInt i,idx;
652:   for (i=0; i<n; i++) {
653:     idx   = *indicesx++;
654:     y[0]  = x[idx];
655:     y[1]  = x[idx+1];
656:     y[2]  = x[idx+2];
657:     y    += 3;
658:   }
659: }
660: PETSC_STATIC_INLINE void UnPack_3(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
661: {
662:   PetscInt i,idy;
664:   switch (addv) {
665:   case INSERT_VALUES:
666:   case INSERT_ALL_VALUES:
667:     for (i=0; i<n; i++) {
668:       idy       = *indicesy++;
669:       y[idy]    = x[0];
670:       y[idy+1]  = x[1];
671:       y[idy+2]  = x[2];
672:       x    += 3;
673:     }
674:     break;
675:   case ADD_VALUES:
676:   case ADD_ALL_VALUES:
677:     for (i=0; i<n; i++) {
678:       idy       = *indicesy++;
679:       y[idy]    += x[0];
680:       y[idy+1]  += x[1];
681:       y[idy+2]  += x[2];
682:       x    += 3;
683:     }
684:     break;
685: #if !defined(PETSC_USE_COMPLEX)
686:   case MAX_VALUES:
687:     for (i=0; i<n; i++) {
688:       idy       = *indicesy++;
689:       y[idy]    = PetscMax(y[idy],x[0]);
690:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
691:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
692:       x    += 3;
693:     }
694: #else
695:   case MAX_VALUES:
696: #endif
697:   case NOT_SET_VALUES:
698:     break;
699:   }
700: }
702: PETSC_STATIC_INLINE void Scatter_3(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
703: {
704:   PetscInt i,idx,idy;
706:   switch (addv) {
707:   case INSERT_VALUES:
708:   case INSERT_ALL_VALUES:
709:     for (i=0; i<n; i++) {
710:       idx       = *indicesx++;
711:       idy       = *indicesy++;
712:       y[idy]    = x[idx];
713:       y[idy+1]  = x[idx+1];
714:       y[idy+2]  = x[idx+2];
715:     }
716:     break;
717:   case ADD_VALUES:
718:   case ADD_ALL_VALUES:
719:     for (i=0; i<n; i++) {
720:       idx       = *indicesx++;
721:       idy       = *indicesy++;
722:       y[idy]    += x[idx];
723:       y[idy+1]  += x[idx+1];
724:       y[idy+2]  += x[idx+2];
725:     }
726:     break;
727: #if !defined(PETSC_USE_COMPLEX)
728:   case MAX_VALUES:
729:     for (i=0; i<n; i++) {
730:       idx       = *indicesx++;
731:       idy       = *indicesy++;
732:       y[idy]    = PetscMax(y[idy],x[idx]);
733:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
734:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
735:     }
736: #else
737:   case MAX_VALUES:
738: #endif
739:   case NOT_SET_VALUES:
740:     break;
741:   }
742: }
743:   /* ----------------------------------------------------------------------------------------------- */
744: PETSC_STATIC_INLINE void Pack_4(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
745: {
746:   PetscInt i,idx;
748:   for (i=0; i<n; i++) {
749:     idx   = *indicesx++;
750:     y[0]  = x[idx];
751:     y[1]  = x[idx+1];
752:     y[2]  = x[idx+2];
753:     y[3]  = x[idx+3];
754:     y    += 4;
755:   }
756: }
757: PETSC_STATIC_INLINE void UnPack_4(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
758: {
759:   PetscInt i,idy;
761:   switch (addv) {
762:   case INSERT_VALUES:
763:   case INSERT_ALL_VALUES:
764:     for (i=0; i<n; i++) {
765:       idy       = *indicesy++;
766:       y[idy]    = x[0];
767:       y[idy+1]  = x[1];
768:       y[idy+2]  = x[2];
769:       y[idy+3]  = x[3];
770:       x    += 4;
771:     }
772:     break;
773:   case ADD_VALUES:
774:   case ADD_ALL_VALUES:
775:     for (i=0; i<n; i++) {
776:       idy       = *indicesy++;
777:       y[idy]    += x[0];
778:       y[idy+1]  += x[1];
779:       y[idy+2]  += x[2];
780:       y[idy+3]  += x[3];
781:       x    += 4;
782:     }
783:     break;
784: #if !defined(PETSC_USE_COMPLEX)
785:   case MAX_VALUES:
786:     for (i=0; i<n; i++) {
787:       idy       = *indicesy++;
788:       y[idy]    = PetscMax(y[idy],x[0]);
789:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
790:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
791:       y[idy+3]  = PetscMax(y[idy+3],x[3]);
792:       x    += 4;
793:     }
794: #else
795:   case MAX_VALUES:
796: #endif
797:   case NOT_SET_VALUES:
798:     break;
799:   }
800: }
802: PETSC_STATIC_INLINE void Scatter_4(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
803: {
804:   PetscInt i,idx,idy;
806:   switch (addv) {
807:   case INSERT_VALUES:
808:   case INSERT_ALL_VALUES:
809:     for (i=0; i<n; i++) {
810:       idx       = *indicesx++;
811:       idy       = *indicesy++;
812:       y[idy]    = x[idx];
813:       y[idy+1]  = x[idx+1];
814:       y[idy+2]  = x[idx+2];
815:       y[idy+3]  = x[idx+3];
816:     }
817:     break;
818:   case ADD_VALUES:
819:   case ADD_ALL_VALUES:
820:     for (i=0; i<n; i++) {
821:       idx       = *indicesx++;
822:       idy       = *indicesy++;
823:       y[idy]    += x[idx];
824:       y[idy+1]  += x[idx+1];
825:       y[idy+2]  += x[idx+2];
826:       y[idy+3]  += x[idx+3];
827:     }
828:     break;
829: #if !defined(PETSC_USE_COMPLEX)
830:   case MAX_VALUES:
831:     for (i=0; i<n; i++) {
832:       idx       = *indicesx++;
833:       idy       = *indicesy++;
834:       y[idy]    = PetscMax(y[idy],x[idx]);
835:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
836:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
837:       y[idy+3]  = PetscMax(y[idy+3],x[idx+3]);
838:     }
839: #else
840:   case MAX_VALUES:
841: #endif
842:   case NOT_SET_VALUES:
843:     break;
844:   }
845: }
846:   /* ----------------------------------------------------------------------------------------------- */
847: PETSC_STATIC_INLINE void Pack_5(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
848: {
849:   PetscInt i,idx;
851:   for (i=0; i<n; i++) {
852:     idx   = *indicesx++;
853:     y[0]  = x[idx];
854:     y[1]  = x[idx+1];
855:     y[2]  = x[idx+2];
856:     y[3]  = x[idx+3];
857:     y[4]  = x[idx+4];
858:     y    += 5;
859:   }
860: }
861: PETSC_STATIC_INLINE void UnPack_5(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
862: {
863:   PetscInt i,idy;
865:   switch (addv) {
866:   case INSERT_VALUES:
867:   case INSERT_ALL_VALUES:
868:     for (i=0; i<n; i++) {
869:       idy       = *indicesy++;
870:       y[idy]    = x[0];
871:       y[idy+1]  = x[1];
872:       y[idy+2]  = x[2];
873:       y[idy+3]  = x[3];
874:       y[idy+4]  = x[4];
875:       x    += 5;
876:     }
877:     break;
878:   case ADD_VALUES:
879:   case ADD_ALL_VALUES:
880:     for (i=0; i<n; i++) {
881:       idy       = *indicesy++;
882:       y[idy]    += x[0];
883:       y[idy+1]  += x[1];
884:       y[idy+2]  += x[2];
885:       y[idy+3]  += x[3];
886:       y[idy+4]  += x[4];
887:       x    += 5;
888:     }
889:     break;
890: #if !defined(PETSC_USE_COMPLEX)
891:   case MAX_VALUES:
892:     for (i=0; i<n; i++) {
893:       idy       = *indicesy++;
894:       y[idy]    = PetscMax(y[idy],x[0]);
895:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
896:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
897:       y[idy+3]  = PetscMax(y[idy+3],x[3]);
898:       y[idy+4]  = PetscMax(y[idy+4],x[4]);
899:       x    += 5;
900:     }
901: #else
902:   case MAX_VALUES:
903: #endif
904:   case NOT_SET_VALUES:
905:     break;
906:   }
907: }
909: PETSC_STATIC_INLINE void Scatter_5(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
910: {
911:   PetscInt i,idx,idy;
913:   switch (addv) {
914:   case INSERT_VALUES:
915:   case INSERT_ALL_VALUES:
916:     for (i=0; i<n; i++) {
917:       idx       = *indicesx++;
918:       idy       = *indicesy++;
919:       y[idy]    = x[idx];
920:       y[idy+1]  = x[idx+1];
921:       y[idy+2]  = x[idx+2];
922:       y[idy+3]  = x[idx+3];
923:       y[idy+4]  = x[idx+4];
924:     }
925:     break;
926:   case ADD_VALUES:
927:   case ADD_ALL_VALUES:
928:     for (i=0; i<n; i++) {
929:       idx       = *indicesx++;
930:       idy       = *indicesy++;
931:       y[idy]    += x[idx];
932:       y[idy+1]  += x[idx+1];
933:       y[idy+2]  += x[idx+2];
934:       y[idy+3]  += x[idx+3];
935:       y[idy+4]  += x[idx+4];
936:     }
937:     break;
938: #if !defined(PETSC_USE_COMPLEX)
939:   case MAX_VALUES:
940:     for (i=0; i<n; i++) {
941:       idx       = *indicesx++;
942:       idy       = *indicesy++;
943:       y[idy]    = PetscMax(y[idy],x[idx]);
944:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
945:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
946:       y[idy+3]  = PetscMax(y[idy+3],x[idx+3]);
947:       y[idy+4]  = PetscMax(y[idy+4],x[idx+4]);
948:     }
949: #else
950:   case MAX_VALUES:
951: #endif
952:   case NOT_SET_VALUES:
953:     break;
954:   }
955: }
956:   /* ----------------------------------------------------------------------------------------------- */
957: PETSC_STATIC_INLINE void Pack_6(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
958: {
959:   PetscInt i,idx;
961:   for (i=0; i<n; i++) {
962:     idx   = *indicesx++;
963:     y[0]  = x[idx];
964:     y[1]  = x[idx+1];
965:     y[2]  = x[idx+2];
966:     y[3]  = x[idx+3];
967:     y[4]  = x[idx+4];
968:     y[5]  = x[idx+5];
969:     y    += 6;
970:   }
971: }
972: PETSC_STATIC_INLINE void UnPack_6(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
973: {
974:   PetscInt i,idy;
976:   switch (addv) {
977:   case INSERT_VALUES:
978:   case INSERT_ALL_VALUES:
979:     for (i=0; i<n; i++) {
980:       idy       = *indicesy++;
981:       y[idy]    = x[0];
982:       y[idy+1]  = x[1];
983:       y[idy+2]  = x[2];
984:       y[idy+3]  = x[3];
985:       y[idy+4]  = x[4];
986:       y[idy+5]  = x[5];
987:       x    += 6;
988:     }
989:     break;
990:   case ADD_VALUES:
991:   case ADD_ALL_VALUES:
992:     for (i=0; i<n; i++) {
993:       idy       = *indicesy++;
994:       y[idy]    += x[0];
995:       y[idy+1]  += x[1];
996:       y[idy+2]  += x[2];
997:       y[idy+3]  += x[3];
998:       y[idy+4]  += x[4];
999:       y[idy+5]  += x[5];
1000:       x    += 6;
1001:     }
1002:     break;
1003: #if !defined(PETSC_USE_COMPLEX)
1004:   case MAX_VALUES:
1005:     for (i=0; i<n; i++) {
1006:       idy       = *indicesy++;
1007:       y[idy]    = PetscMax(y[idy],x[0]);
1008:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
1009:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
1010:       y[idy+3]  = PetscMax(y[idy+3],x[3]);
1011:       y[idy+4]  = PetscMax(y[idy+4],x[4]);
1012:       y[idy+5]  = PetscMax(y[idy+5],x[5]);
1013:       x    += 6;
1014:     }
1015: #else
1016:   case MAX_VALUES:
1017: #endif
1018:   case NOT_SET_VALUES:
1019:     break;
1020:   }
1021: }
1023: PETSC_STATIC_INLINE void Scatter_6(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1024: {
1025:   PetscInt i,idx,idy;
1027:   switch (addv) {
1028:   case INSERT_VALUES:
1029:   case INSERT_ALL_VALUES:
1030:     for (i=0; i<n; i++) {
1031:       idx       = *indicesx++;
1032:       idy       = *indicesy++;
1033:       y[idy]    = x[idx];
1034:       y[idy+1]  = x[idx+1];
1035:       y[idy+2]  = x[idx+2];
1036:       y[idy+3]  = x[idx+3];
1037:       y[idy+4]  = x[idx+4];
1038:       y[idy+5]  = x[idx+5];
1039:     }
1040:     break;
1041:   case ADD_VALUES:
1042:   case ADD_ALL_VALUES:
1043:     for (i=0; i<n; i++) {
1044:       idx       = *indicesx++;
1045:       idy       = *indicesy++;
1046:       y[idy]    += x[idx];
1047:       y[idy+1]  += x[idx+1];
1048:       y[idy+2]  += x[idx+2];
1049:       y[idy+3]  += x[idx+3];
1050:       y[idy+4]  += x[idx+4];
1051:       y[idy+5]  += x[idx+5];
1052:     }
1053:     break;
1054: #if !defined(PETSC_USE_COMPLEX)
1055:   case MAX_VALUES:
1056:     for (i=0; i<n; i++) {
1057:       idx       = *indicesx++;
1058:       idy       = *indicesy++;
1059:       y[idy]    = PetscMax(y[idy],x[idx]);
1060:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
1061:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
1062:       y[idy+3]  = PetscMax(y[idy+3],x[idx+3]);
1063:       y[idy+4]  = PetscMax(y[idy+4],x[idx+4]);
1064:       y[idy+5]  = PetscMax(y[idy+5],x[idx+5]);
1065:     }
1066: #else
1067:   case MAX_VALUES:
1068: #endif
1069:   case NOT_SET_VALUES:
1070:     break;
1071:   }
1072: }
1073:   /* ----------------------------------------------------------------------------------------------- */
1074: PETSC_STATIC_INLINE void Pack_7(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
1075: {
1076:   PetscInt i,idx;
1078:   for (i=0; i<n; i++) {
1079:     idx   = *indicesx++;
1080:     y[0]  = x[idx];
1081:     y[1]  = x[idx+1];
1082:     y[2]  = x[idx+2];
1083:     y[3]  = x[idx+3];
1084:     y[4]  = x[idx+4];
1085:     y[5]  = x[idx+5];
1086:     y[6]  = x[idx+6];
1087:     y    += 7;
1088:   }
1089: }
1090: PETSC_STATIC_INLINE void UnPack_7(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1091: {
1092:   PetscInt i,idy;
1094:   switch (addv) {
1095:   case INSERT_VALUES:
1096:   case INSERT_ALL_VALUES:
1097:     for (i=0; i<n; i++) {
1098:       idy       = *indicesy++;
1099:       y[idy]    = x[0];
1100:       y[idy+1]  = x[1];
1101:       y[idy+2]  = x[2];
1102:       y[idy+3]  = x[3];
1103:       y[idy+4]  = x[4];
1104:       y[idy+5]  = x[5];
1105:       y[idy+6]  = x[6];
1106:       x    += 7;
1107:     }
1108:     break;
1109:   case ADD_VALUES:
1110:   case ADD_ALL_VALUES:
1111:     for (i=0; i<n; i++) {
1112:       idy       = *indicesy++;
1113:       y[idy]    += x[0];
1114:       y[idy+1]  += x[1];
1115:       y[idy+2]  += x[2];
1116:       y[idy+3]  += x[3];
1117:       y[idy+4]  += x[4];
1118:       y[idy+5]  += x[5];
1119:       y[idy+6]  += x[6];
1120:       x    += 7;
1121:     }
1122:     break;
1123: #if !defined(PETSC_USE_COMPLEX)
1124:   case MAX_VALUES:
1125:     for (i=0; i<n; i++) {
1126:       idy       = *indicesy++;
1127:       y[idy]    = PetscMax(y[idy],x[0]);
1128:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
1129:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
1130:       y[idy+3]  = PetscMax(y[idy+3],x[3]);
1131:       y[idy+4]  = PetscMax(y[idy+4],x[4]);
1132:       y[idy+5]  = PetscMax(y[idy+5],x[5]);
1133:       y[idy+6]  = PetscMax(y[idy+6],x[6]);
1134:       x    += 7;
1135:     }
1136: #else
1137:   case MAX_VALUES:
1138: #endif
1139:   case NOT_SET_VALUES:
1140:     break;
1141:   }
1142: }
1144: PETSC_STATIC_INLINE void Scatter_7(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1145: {
1146:   PetscInt i,idx,idy;
1148:   switch (addv) {
1149:   case INSERT_VALUES:
1150:   case INSERT_ALL_VALUES:
1151:     for (i=0; i<n; i++) {
1152:       idx       = *indicesx++;
1153:       idy       = *indicesy++;
1154:       y[idy]    = x[idx];
1155:       y[idy+1]  = x[idx+1];
1156:       y[idy+2]  = x[idx+2];
1157:       y[idy+3]  = x[idx+3];
1158:       y[idy+4]  = x[idx+4];
1159:       y[idy+5]  = x[idx+5];
1160:       y[idy+6]  = x[idx+6];
1161:     }
1162:     break;
1163:   case ADD_VALUES:
1164:   case ADD_ALL_VALUES:
1165:     for (i=0; i<n; i++) {
1166:       idx       = *indicesx++;
1167:       idy       = *indicesy++;
1168:       y[idy]    += x[idx];
1169:       y[idy+1]  += x[idx+1];
1170:       y[idy+2]  += x[idx+2];
1171:       y[idy+3]  += x[idx+3];
1172:       y[idy+4]  += x[idx+4];
1173:       y[idy+5]  += x[idx+5];
1174:       y[idy+6]  += x[idx+6];
1175:     }
1176:     break;
1177: #if !defined(PETSC_USE_COMPLEX)
1178:   case MAX_VALUES:
1179:     for (i=0; i<n; i++) {
1180:       idx       = *indicesx++;
1181:       idy       = *indicesy++;
1182:       y[idy]    = PetscMax(y[idy],x[idx]);
1183:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
1184:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
1185:       y[idy+3]  = PetscMax(y[idy+3],x[idx+3]);
1186:       y[idy+4]  = PetscMax(y[idy+4],x[idx+4]);
1187:       y[idy+5]  = PetscMax(y[idy+5],x[idx+5]);
1188:       y[idy+6]  = PetscMax(y[idy+6],x[idx+6]);
1189:     }
1190: #else
1191:   case MAX_VALUES:
1192: #endif
1193:   case NOT_SET_VALUES:
1194:     break;
1195:   }
1196: }
1197:   /* ----------------------------------------------------------------------------------------------- */
1198: PETSC_STATIC_INLINE void Pack_8(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
1199: {
1200:   PetscInt i,idx;
1202:   for (i=0; i<n; i++) {
1203:     idx   = *indicesx++;
1204:     y[0]  = x[idx];
1205:     y[1]  = x[idx+1];
1206:     y[2]  = x[idx+2];
1207:     y[3]  = x[idx+3];
1208:     y[4]  = x[idx+4];
1209:     y[5]  = x[idx+5];
1210:     y[6]  = x[idx+6];
1211:     y[7]  = x[idx+7];
1212:     y    += 8;
1213:   }
1214: }
1215: PETSC_STATIC_INLINE void UnPack_8(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1216: {
1217:   PetscInt i,idy;
1219:   switch (addv) {
1220:   case INSERT_VALUES:
1221:   case INSERT_ALL_VALUES:
1222:     for (i=0; i<n; i++) {
1223:       idy       = *indicesy++;
1224:       y[idy]    = x[0];
1225:       y[idy+1]  = x[1];
1226:       y[idy+2]  = x[2];
1227:       y[idy+3]  = x[3];
1228:       y[idy+4]  = x[4];
1229:       y[idy+5]  = x[5];
1230:       y[idy+6]  = x[6];
1231:       y[idy+7]  = x[7];
1232:       x    += 8;
1233:     }
1234:     break;
1235:   case ADD_VALUES:
1236:   case ADD_ALL_VALUES:
1237:     for (i=0; i<n; i++) {
1238:       idy       = *indicesy++;
1239:       y[idy]    += x[0];
1240:       y[idy+1]  += x[1];
1241:       y[idy+2]  += x[2];
1242:       y[idy+3]  += x[3];
1243:       y[idy+4]  += x[4];
1244:       y[idy+5]  += x[5];
1245:       y[idy+6]  += x[6];
1246:       y[idy+7]  += x[7];
1247:       x    += 8;
1248:     }
1249:     break;
1250: #if !defined(PETSC_USE_COMPLEX)
1251:   case MAX_VALUES:
1252:     for (i=0; i<n; i++) {
1253:       idy       = *indicesy++;
1254:       y[idy]    = PetscMax(y[idy],x[0]);
1255:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
1256:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
1257:       y[idy+3]  = PetscMax(y[idy+3],x[3]);
1258:       y[idy+4]  = PetscMax(y[idy+4],x[4]);
1259:       y[idy+5]  = PetscMax(y[idy+5],x[5]);
1260:       y[idy+6]  = PetscMax(y[idy+6],x[6]);
1261:       y[idy+7]  = PetscMax(y[idy+7],x[7]);
1262:       x    += 8;
1263:     }
1264: #else
1265:   case MAX_VALUES:
1266: #endif
1267:   case NOT_SET_VALUES:
1268:     break;
1269:   }
1270: }
1272: PETSC_STATIC_INLINE void Scatter_8(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1273: {
1274:   PetscInt i,idx,idy;
1276:   switch (addv) {
1277:   case INSERT_VALUES:
1278:   case INSERT_ALL_VALUES:
1279:     for (i=0; i<n; i++) {
1280:       idx       = *indicesx++;
1281:       idy       = *indicesy++;
1282:       y[idy]    = x[idx];
1283:       y[idy+1]  = x[idx+1];
1284:       y[idy+2]  = x[idx+2];
1285:       y[idy+3]  = x[idx+3];
1286:       y[idy+4]  = x[idx+4];
1287:       y[idy+5]  = x[idx+5];
1288:       y[idy+6]  = x[idx+6];
1289:       y[idy+7]  = x[idx+7];
1290:     }
1291:     break;
1292:   case ADD_VALUES:
1293:   case ADD_ALL_VALUES:
1294:     for (i=0; i<n; i++) {
1295:       idx       = *indicesx++;
1296:       idy       = *indicesy++;
1297:       y[idy]    += x[idx];
1298:       y[idy+1]  += x[idx+1];
1299:       y[idy+2]  += x[idx+2];
1300:       y[idy+3]  += x[idx+3];
1301:       y[idy+4]  += x[idx+4];
1302:       y[idy+5]  += x[idx+5];
1303:       y[idy+6]  += x[idx+6];
1304:       y[idy+7]  += x[idx+7];
1305:     }
1306:     break;
1307: #if !defined(PETSC_USE_COMPLEX)
1308:   case MAX_VALUES:
1309:     for (i=0; i<n; i++) {
1310:       idx       = *indicesx++;
1311:       idy       = *indicesy++;
1312:       y[idy]    = PetscMax(y[idy],x[idx]);
1313:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
1314:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
1315:       y[idy+3]  = PetscMax(y[idy+3],x[idx+3]);
1316:       y[idy+4]  = PetscMax(y[idy+4],x[idx+4]);
1317:       y[idy+5]  = PetscMax(y[idy+5],x[idx+5]);
1318:       y[idy+6]  = PetscMax(y[idy+6],x[idx+6]);
1319:       y[idy+7]  = PetscMax(y[idy+7],x[idx+7]);
1320:     }
1321: #else
1322:   case MAX_VALUES:
1323: #endif
1324:   case NOT_SET_VALUES:
1325:     break;
1326:   }
1327: }
1329:   /* ----------------------------------------------------------------------------------------------- */
1330: PETSC_STATIC_INLINE void Pack_12(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
1331: {
1332:   PetscInt i,idx;
1334:   for (i=0; i<n; i++) {
1335:     idx   = *indicesx++;
1336:     y[0]  = x[idx];
1337:     y[1]  = x[idx+1];
1338:     y[2]  = x[idx+2];
1339:     y[3]  = x[idx+3];
1340:     y[4]  = x[idx+4];
1341:     y[5]  = x[idx+5];
1342:     y[6]  = x[idx+6];
1343:     y[7]  = x[idx+7];
1344:     y[8]  = x[idx+8];
1345:     y[9]  = x[idx+9];
1346:     y[10] = x[idx+10];
1347:     y[11] = x[idx+11];
1348:     y    += 12;
1349:   }
1350: }
1351: PETSC_STATIC_INLINE void UnPack_12(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1352: {
1353:   PetscInt i,idy;
1355:   switch (addv) {
1356:   case INSERT_VALUES:
1357:   case INSERT_ALL_VALUES:
1358:     for (i=0; i<n; i++) {
1359:       idy       = *indicesy++;
1360:       y[idy]    = x[0];
1361:       y[idy+1]  = x[1];
1362:       y[idy+2]  = x[2];
1363:       y[idy+3]  = x[3];
1364:       y[idy+4]  = x[4];
1365:       y[idy+5]  = x[5];
1366:       y[idy+6]  = x[6];
1367:       y[idy+7]  = x[7];
1368:       y[idy+8]  = x[8];
1369:       y[idy+9]  = x[9];
1370:       y[idy+10] = x[10];
1371:       y[idy+11] = x[11];
1372:       x    += 12;
1373:     }
1374:     break;
1375:   case ADD_VALUES:
1376:   case ADD_ALL_VALUES:
1377:     for (i=0; i<n; i++) {
1378:       idy       = *indicesy++;
1379:       y[idy]    += x[0];
1380:       y[idy+1]  += x[1];
1381:       y[idy+2]  += x[2];
1382:       y[idy+3]  += x[3];
1383:       y[idy+4]  += x[4];
1384:       y[idy+5]  += x[5];
1385:       y[idy+6]  += x[6];
1386:       y[idy+7]  += x[7];
1387:       y[idy+8]  += x[8];
1388:       y[idy+9]  += x[9];
1389:       y[idy+10] += x[10];
1390:       y[idy+11] += x[11];
1391:       x    += 12;
1392:     }
1393:     break;
1394: #if !defined(PETSC_USE_COMPLEX)
1395:   case MAX_VALUES:
1396:     for (i=0; i<n; i++) {
1397:       idy       = *indicesy++;
1398:       y[idy]    = PetscMax(y[idy],x[0]);
1399:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
1400:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
1401:       y[idy+3]  = PetscMax(y[idy+3],x[3]);
1402:       y[idy+4]  = PetscMax(y[idy+4],x[4]);
1403:       y[idy+5]  = PetscMax(y[idy+5],x[5]);
1404:       y[idy+6]  = PetscMax(y[idy+6],x[6]);
1405:       y[idy+7]  = PetscMax(y[idy+7],x[7]);
1406:       y[idy+8]  = PetscMax(y[idy+8],x[8]);
1407:       y[idy+9]  = PetscMax(y[idy+9],x[9]);
1408:       y[idy+10] = PetscMax(y[idy+10],x[10]);
1409:       y[idy+11] = PetscMax(y[idy+11],x[11]);
1410:       x    += 12;
1411:     }
1412: #else
1413:   case MAX_VALUES:
1414: #endif
1415:   case NOT_SET_VALUES:
1416:     break;
1417:   }
1418: }
1420: PETSC_STATIC_INLINE void Scatter_12(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1421: {
1422:   PetscInt i,idx,idy;
1424:   switch (addv) {
1425:   case INSERT_VALUES:
1426:   case INSERT_ALL_VALUES:
1427:     for (i=0; i<n; i++) {
1428:       idx       = *indicesx++;
1429:       idy       = *indicesy++;
1430:       y[idy]    = x[idx];
1431:       y[idy+1]  = x[idx+1];
1432:       y[idy+2]  = x[idx+2];
1433:       y[idy+3]  = x[idx+3];
1434:       y[idy+4]  = x[idx+4];
1435:       y[idy+5]  = x[idx+5];
1436:       y[idy+6]  = x[idx+6];
1437:       y[idy+7]  = x[idx+7];
1438:       y[idy+8]  = x[idx+8];
1439:       y[idy+9]  = x[idx+9];
1440:       y[idy+10] = x[idx+10];
1441:       y[idy+11] = x[idx+11];
1442:     }
1443:     break;
1444:   case ADD_VALUES:
1445:   case ADD_ALL_VALUES:
1446:     for (i=0; i<n; i++) {
1447:       idx       = *indicesx++;
1448:       idy       = *indicesy++;
1449:       y[idy]    += x[idx];
1450:       y[idy+1]  += x[idx+1];
1451:       y[idy+2]  += x[idx+2];
1452:       y[idy+3]  += x[idx+3];
1453:       y[idy+4]  += x[idx+4];
1454:       y[idy+5]  += x[idx+5];
1455:       y[idy+6]  += x[idx+6];
1456:       y[idy+7]  += x[idx+7];
1457:       y[idy+8]  += x[idx+8];
1458:       y[idy+9]  += x[idx+9];
1459:       y[idy+10] += x[idx+10];
1460:       y[idy+11] += x[idx+11];
1461:     }
1462:     break;
1463: #if !defined(PETSC_USE_COMPLEX)
1464:   case MAX_VALUES:
1465:     for (i=0; i<n; i++) {
1466:       idx       = *indicesx++;
1467:       idy       = *indicesy++;
1468:       y[idy]    = PetscMax(y[idy],x[idx]);
1469:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
1470:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
1471:       y[idy+3]  = PetscMax(y[idy+3],x[idx+3]);
1472:       y[idy+4]  = PetscMax(y[idy+4],x[idx+4]);
1473:       y[idy+5]  = PetscMax(y[idy+5],x[idx+5]);
1474:       y[idy+6]  = PetscMax(y[idy+6],x[idx+6]);
1475:       y[idy+7]  = PetscMax(y[idy+7],x[idx+7]);
1476:       y[idy+8]  = PetscMax(y[idy+8],x[idx+8]);
1477:       y[idy+9]  = PetscMax(y[idy+9],x[idx+9]);
1478:       y[idy+10] = PetscMax(y[idy+10],x[idx+10]);
1479:       y[idy+11] = PetscMax(y[idy+11],x[idx+11]);
1480:     }
1481: #else
1482:   case MAX_VALUES:
1483: #endif
1484:   case NOT_SET_VALUES:
1485:     break;
1486:   }
1487: }
1489: /* Create the VecScatterBegin/End_P for our chosen block sizes */
1490: #define BS 1
1491: #include <../src/vec/vec/utils/vpscat.h>
1492: #define BS 2
1493: #include <../src/vec/vec/utils/vpscat.h>
1494: #define BS 3
1495: #include <../src/vec/vec/utils/vpscat.h>
1496: #define BS 4
1497: #include <../src/vec/vec/utils/vpscat.h>
1498: #define BS 5
1499: #include <../src/vec/vec/utils/vpscat.h>
1500: #define BS 6
1501: #include <../src/vec/vec/utils/vpscat.h>
1502: #define BS 7
1503: #include <../src/vec/vec/utils/vpscat.h>
1504: #define BS 8
1505: #include <../src/vec/vec/utils/vpscat.h>
1506: #define BS 12
1507: #include <../src/vec/vec/utils/vpscat.h>
1509: /* ==========================================================================================*/
1511: /*              create parallel to sequential scatter context                           */
1513: PetscErrorCode VecScatterCreateCommon_PtoS(VecScatter_MPI_General *,VecScatter_MPI_General *,VecScatter);
1517: /*@
1518:      VecScatterCreateLocal - Creates a VecScatter from a list of messages it must send and receive.
1520:      Collective on VecScatter
1522:    Input Parameters:
1523: +     VecScatter - obtained with VecScatterCreateEmpty()
1524: .     nsends -
1525: .     sendSizes -
1526: .     sendProcs -
1527: .     sendIdx - indices where the sent entries are obtained from (in local, on process numbering), this is one long array of size \sum_{i=0,i<nsends} sendSizes[i]
1528: .     nrecvs - number of receives to expect
1529: .     recvSizes - 
1530: .     recvProcs - processes who are sending to me
1531: .     recvIdx - indices of where received entries are to be put, (in local, on process numbering), this is one long array of size \sum_{i=0,i<nrecvs} recvSizes[i]
1532: -     bs - size of block
1534:      Notes:  sendSizes[] and recvSizes[] cannot have any 0 entries. If you want to support having 0 entries you need
1535:       to change the code below to "compress out" the sendProcs[] and recvProcs[] entries that have 0 entries.
1537:        Probably does not handle sends to self properly. It should remove those from the counts that are used
1538:       in allocating space inside of the from struct
1540:   Level: intermediate
1542: @*/
1543: PetscErrorCode VecScatterCreateLocal(VecScatter ctx,PetscInt nsends,const PetscInt sendSizes[],const PetscInt sendProcs[],const PetscInt sendIdx[],PetscInt nrecvs,const PetscInt recvSizes[],const PetscInt recvProcs[],const PetscInt recvIdx[],PetscInt bs)
1544: {
1545:   VecScatter_MPI_General *from, *to;
1546:   PetscInt               sendSize, recvSize;
1547:   PetscInt               n, i;
1548:   PetscErrorCode         ierr;
1550:   /* allocate entire send scatter context */
1551:   PetscNewLog(ctx,VecScatter_MPI_General,&to);
1552:   to->n = nsends;
1553:   for(n = 0, sendSize = 0; n < to->n; n++) {sendSize += sendSizes[n];}
1554:   PetscMalloc(to->n*sizeof(MPI_Request),&to->requests);
1555:   PetscMalloc4(bs*sendSize,PetscScalar,&to->values,sendSize,PetscInt,&to->indices,to->n+1,PetscInt,&to->starts,to->n,PetscMPIInt,&to->procs);
1556:   PetscMalloc2(PetscMax(to->n,nrecvs),MPI_Status,&to->sstatus,PetscMax(to->n,nrecvs),MPI_Status,&to->rstatus);
1557:   to->starts[0] = 0;
1558:   for(n = 0; n < to->n; n++) {
1559:     if (sendSizes[n] <=0 ) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"sendSizes[n=%D] = %D cannot be less than 1",n,sendSizes[n]);
1560:     to->starts[n+1] = to->starts[n] + sendSizes[n];
1561:     to->procs[n] = sendProcs[n];
1562:     for(i = to->starts[n]; i < to->starts[n]+sendSizes[n]; i++) {
1563:       to->indices[i] = sendIdx[i];
1564:     }
1565:   }
1566:   ctx->todata = (void *) to;
1568:   /* allocate entire receive scatter context */
1569:   PetscNewLog(ctx,VecScatter_MPI_General,&from);
1570:   from->n = nrecvs;
1571:   for(n = 0, recvSize = 0; n < from->n; n++) {recvSize += recvSizes[n];}
1572:   PetscMalloc(from->n*sizeof(MPI_Request),&from->requests);
1573:   PetscMalloc4(bs*recvSize,PetscScalar,&from->values,recvSize,PetscInt,&from->indices,from->n+1,PetscInt,&from->starts,from->n,PetscMPIInt,&from->procs);
1574:   from->starts[0] = 0;
1575:   for(n = 0; n < from->n; n++) {
1576:     if (recvSizes[n] <=0 ) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"recvSizes[n=%D] = %D cannot be less than 1",n,recvSizes[n]);
1577:     from->starts[n+1] = from->starts[n] + recvSizes[n];
1578:     from->procs[n] = recvProcs[n];
1579:     for(i = from->starts[n]; i < from->starts[n]+recvSizes[n]; i++) {
1580:       from->indices[i] = recvIdx[i];
1581:     }
1582:   }
1583:   ctx->fromdata = (void *)from;
1585:   /* No local scatter optimization */
1586:   from->local.n      = 0;
1587:   from->local.vslots = 0;
1588:   to->local.n        = 0;
1589:   to->local.vslots   = 0;
1590:   from->local.nonmatching_computed = PETSC_FALSE;
1591:   from->local.n_nonmatching        = 0;
1592:   from->local.slots_nonmatching    = 0;
1593:   to->local.nonmatching_computed   = PETSC_FALSE;
1594:   to->local.n_nonmatching          = 0;
1595:   to->local.slots_nonmatching      = 0;
1597:   from->type = VEC_SCATTER_MPI_GENERAL;
1598:   to->type   = VEC_SCATTER_MPI_GENERAL;
1599:   from->bs = bs;
1600:   to->bs   = bs;
1601:   VecScatterCreateCommon_PtoS(from, to, ctx);
1603:   /* mark lengths as negative so it won't check local vector lengths */
1604:   ctx->from_n = ctx->to_n = -1;
1605:   return(0);
1606: }
1608: /*
1609:    bs indicates how many elements there are in each block. Normally this would be 1.
1611:    contains check that PetscMPIInt can handle the sizes needed 
1612: */
1615: PetscErrorCode VecScatterCreate_PtoS(PetscInt nx,const PetscInt *inidx,PetscInt ny,const PetscInt *inidy,Vec xin,Vec yin,PetscInt bs,VecScatter ctx)
1616: {
1617:   VecScatter_MPI_General *from,*to;
1618:   PetscMPIInt            size,rank,imdex,tag,n;
1619:   PetscInt               *source = PETSC_NULL,*owners = PETSC_NULL;
1620:   PetscInt               *lowner = PETSC_NULL,*start = PETSC_NULL,lengthy,lengthx;
1621:   PetscMPIInt            *nprocs = PETSC_NULL,nrecvs;
1622:   PetscInt               i,j,idx,nsends;
1623:   PetscInt               *owner = PETSC_NULL,*starts = PETSC_NULL,count,slen;
1624:   PetscInt               *rvalues,*svalues,base,*values,nprocslocal,recvtotal,*rsvalues;
1625:   PetscMPIInt            *onodes1,*olengths1;
1626:   MPI_Comm               comm;
1627:   MPI_Request            *send_waits = PETSC_NULL,*recv_waits = PETSC_NULL;
1628:   MPI_Status             recv_status,*send_status;
1629:   PetscErrorCode         ierr;
1632:   PetscObjectGetNewTag((PetscObject)ctx,&tag);
1633:   PetscObjectGetComm((PetscObject)xin,&comm);
1634:   MPI_Comm_rank(comm,&rank);
1635:   MPI_Comm_size(comm,&size);
1636:   owners = xin->map->range;
1637:   VecGetSize(yin,&lengthy);
1638:   VecGetSize(xin,&lengthx);
1640:   /*  first count number of contributors to each processor */
1641:   PetscMalloc2(size,PetscMPIInt,&nprocs,nx,PetscInt,&owner);
1642:   PetscMemzero(nprocs,size*sizeof(PetscMPIInt));
1644:   j      = 0;
1645:   nsends = 0;
1646:   for (i=0; i<nx; i++) {
1647:     idx = bs*inidx[i];
1648:     if (idx < owners[j]) j = 0;
1649:     for (; j<size; j++) {
1650:       if (idx < owners[j+1]) {
1651:         if (!nprocs[j]++) nsends++;
1652:         owner[i] = j;
1653:         break;
1654:       }
1655:     }
1656:   }
1658:   nprocslocal  = nprocs[rank];
1659:   nprocs[rank] = 0;
1660:   if (nprocslocal) nsends--;
1661:   /* inform other processors of number of messages and max length*/
1662:   PetscGatherNumberOfMessages(comm,PETSC_NULL,nprocs,&nrecvs);
1663:   PetscGatherMessageLengths(comm,nsends,nrecvs,nprocs,&onodes1,&olengths1);
1664:   PetscSortMPIIntWithArray(nrecvs,onodes1,olengths1);
1665:   recvtotal = 0; for (i=0; i<nrecvs; i++) recvtotal += olengths1[i];
1667:   /* post receives:   */
1668:   PetscMalloc3(recvtotal,PetscInt,&rvalues,nrecvs,PetscInt,&source,nrecvs,MPI_Request,&recv_waits);
1669:   count  = 0;
1670:   for (i=0; i<nrecvs; i++) {
1671:     MPI_Irecv((rvalues+count),olengths1[i],MPIU_INT,onodes1[i],tag,comm,recv_waits+i);
1672:     count += olengths1[i];
1673:   }
1675:   /* do sends:
1676:      1) starts[i] gives the starting index in svalues for stuff going to 
1677:      the ith processor
1678:   */
1679:   PetscMalloc3(nx,PetscInt,&svalues,nsends,MPI_Request,&send_waits,size+1,PetscInt,&starts);
1681:   starts[0]  = 0;
1682:   for (i=1; i<size; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
1683:   for (i=0; i<nx; i++) {
1684:     if (owner[i] != rank) {
1685:       svalues[starts[owner[i]]++] = bs*inidx[i];
1686:     }
1687:   }
1688:   starts[0] = 0;
1689:   for (i=1; i<size+1; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
1690:   count = 0;
1691:   for (i=0; i<size; i++) {
1692:     if (nprocs[i]) {
1693:       MPI_Isend(svalues+starts[i],nprocs[i],MPIU_INT,i,tag,comm,send_waits+count++);
1694:     }
1695:   }
1697:   /*  wait on receives */
1698:   count  = nrecvs;
1699:   slen   = 0;
1700:   while (count) {
1701:     MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
1702:     /* unpack receives into our local space */
1703:     MPI_Get_count(&recv_status,MPIU_INT,&n);
1704:     slen += n;
1705:     count--;
1706:   }
1708:   if (slen != recvtotal) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Total message lengths %D not expected %D",slen,recvtotal);
1709: 
1710:   /* allocate entire send scatter context */
1711:   PetscNewLog(ctx,VecScatter_MPI_General,&to);
1712:   to->n = nrecvs;
1713:   PetscMalloc(nrecvs*sizeof(MPI_Request),&to->requests);
1714:   PetscMalloc4(bs*slen,PetscScalar,&to->values,slen,PetscInt,&to->indices,nrecvs+1,PetscInt,&to->starts,nrecvs,PetscMPIInt,&to->procs);
1715:   PetscMalloc2(PetscMax(to->n,nsends),MPI_Status,&to->sstatus,PetscMax(to->n,nsends),MPI_Status,&to->rstatus);
1716:   ctx->todata   = (void*)to;
1717:   to->starts[0] = 0;
1719:   if (nrecvs) {
1721:     /* move the data into the send scatter */
1722:     base     = owners[rank];
1723:     rsvalues = rvalues;
1724:     for (i=0; i<nrecvs; i++) {
1725:       to->starts[i+1] = to->starts[i] + olengths1[i];
1726:       to->procs[i]    = onodes1[i];
1727:       values = rsvalues;
1728:       rsvalues += olengths1[i];
1729:       for (j=0; j<olengths1[i]; j++) {
1730:         to->indices[to->starts[i] + j] = values[j] - base;
1731:       }
1732:     }
1733:   }
1734:   PetscFree(olengths1);
1735:   PetscFree(onodes1);
1736:   PetscFree3(rvalues,source,recv_waits);
1738:   /* allocate entire receive scatter context */
1739:   PetscNewLog(ctx,VecScatter_MPI_General,&from);
1740:   from->n = nsends;
1742:   PetscMalloc(nsends*sizeof(MPI_Request),&from->requests);
1743:   PetscMalloc4((ny-nprocslocal)*bs,PetscScalar,&from->values,ny-nprocslocal,PetscInt,&from->indices,nsends+1,PetscInt,&from->starts,from->n,PetscMPIInt,&from->procs);
1744:   ctx->fromdata  = (void*)from;
1746:   /* move data into receive scatter */
1747:   PetscMalloc2(size,PetscInt,&lowner,nsends+1,PetscInt,&start);
1748:   count = 0; from->starts[0] = start[0] = 0;
1749:   for (i=0; i<size; i++) {
1750:     if (nprocs[i]) {
1751:       lowner[i]            = count;
1752:       from->procs[count++] = i;
1753:       from->starts[count]  = start[count] = start[count-1] + nprocs[i];
1754:     }
1755:   }
1757:   for (i=0; i<nx; i++) {
1758:     if (owner[i] != rank) {
1759:       from->indices[start[lowner[owner[i]]]++] = bs*inidy[i];
1760:       if (bs*inidy[i] >= lengthy) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Scattering past end of TO vector");
1761:     }
1762:   }
1763:   PetscFree2(lowner,start);
1764:   PetscFree2(nprocs,owner);
1765: 
1766:   /* wait on sends */
1767:   if (nsends) {
1768:     PetscMalloc(nsends*sizeof(MPI_Status),&send_status);
1769:     MPI_Waitall(nsends,send_waits,send_status);
1770:     PetscFree(send_status);
1771:   }
1772:   PetscFree3(svalues,send_waits,starts);
1774:   if (nprocslocal) {
1775:     PetscInt nt = from->local.n = to->local.n = nprocslocal;
1776:     /* we have a scatter to ourselves */
1777:     PetscMalloc(nt*sizeof(PetscInt),&to->local.vslots);
1778:     PetscMalloc(nt*sizeof(PetscInt),&from->local.vslots);
1779:     nt   = 0;
1780:     for (i=0; i<nx; i++) {
1781:       idx = bs*inidx[i];
1782:       if (idx >= owners[rank] && idx < owners[rank+1]) {
1783:         to->local.vslots[nt]     = idx - owners[rank];
1784:         from->local.vslots[nt++] = bs*inidy[i];
1785:         if (bs*inidy[i] >= lengthy) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Scattering past end of TO vector");
1786:       }
1787:     }
1788:   } else {
1789:     from->local.n      = 0;
1790:     from->local.vslots = 0;
1791:     to->local.n        = 0;
1792:     to->local.vslots   = 0;
1793:   }
1795:   from->local.nonmatching_computed = PETSC_FALSE;
1796:   from->local.n_nonmatching        = 0;
1797:   from->local.slots_nonmatching    = 0;
1798:   to->local.nonmatching_computed   = PETSC_FALSE;
1799:   to->local.n_nonmatching          = 0;
1800:   to->local.slots_nonmatching      = 0;
1802:   from->type = VEC_SCATTER_MPI_GENERAL;
1803:   to->type   = VEC_SCATTER_MPI_GENERAL;
1804:   from->bs = bs;
1805:   to->bs   = bs;
1806:   VecScatterCreateCommon_PtoS(from,to,ctx);
1807:   return(0);
1808: }
1810: /*
1811:    bs indicates how many elements there are in each block. Normally this would be 1.
1812: */
1815: PetscErrorCode VecScatterCreateCommon_PtoS(VecScatter_MPI_General *from,VecScatter_MPI_General *to,VecScatter ctx)
1816: {
1817:   MPI_Comm       comm = ((PetscObject)ctx)->comm;
1818:   PetscMPIInt    tag  = ((PetscObject)ctx)->tag, tagr;
1819:   PetscInt       bs   = to->bs;
1820:   PetscMPIInt    size;
1821:   PetscInt       i, n;
1823: 
1825:   PetscObjectGetNewTag((PetscObject)ctx,&tagr);
1826:   ctx->destroy = VecScatterDestroy_PtoP;
1828:   ctx->reproduce = PETSC_FALSE;
1829:   to->sendfirst  = PETSC_FALSE;
1830:   PetscOptionsGetBool(PETSC_NULL,"-vecscatter_reproduce",&ctx->reproduce,PETSC_NULL);
1831:   PetscOptionsGetBool(PETSC_NULL,"-vecscatter_sendfirst",&to->sendfirst,PETSC_NULL);
1832:   from->sendfirst = to->sendfirst;
1834:   MPI_Comm_size(comm,&size);
1835:   /* check if the receives are ALL going into contiguous locations; if so can skip indexing */
1836:   to->contiq = PETSC_FALSE;
1837:   n = from->starts[from->n];
1838:   from->contiq = PETSC_TRUE;
1839:   for (i=1; i<n; i++) {
1840:     if (from->indices[i] != from->indices[i-1] + bs) {
1841:       from->contiq = PETSC_FALSE;
1842:       break;
1843:     }
1844:   }
1846:   to->use_alltoallv = PETSC_FALSE;
1847:   PetscOptionsGetBool(PETSC_NULL,"-vecscatter_alltoall",&to->use_alltoallv,PETSC_NULL);
1848:   from->use_alltoallv = to->use_alltoallv;
1849:   if (from->use_alltoallv) PetscInfo(ctx,"Using MPI_Alltoallv() for scatter\n");
1850: #if defined(PETSC_HAVE_MPI_ALLTOALLW)  && !defined(PETSC_USE_64BIT_INDICES)
1851:   if (to->use_alltoallv) {
1852:     to->use_alltoallw = PETSC_FALSE;
1853:     PetscOptionsGetBool(PETSC_NULL,"-vecscatter_nopack",&to->use_alltoallw,PETSC_NULL);
1854:   }
1855:   from->use_alltoallw = to->use_alltoallw;
1856:   if (from->use_alltoallw) PetscInfo(ctx,"Using MPI_Alltoallw() for scatter\n");
1857: #endif
1859: #if defined(PETSC_HAVE_MPI_WIN_CREATE)
1860:   to->use_window = PETSC_FALSE;
1861:   PetscOptionsGetBool(PETSC_NULL,"-vecscatter_window",&to->use_window,PETSC_NULL);
1862:   from->use_window = to->use_window;
1863: #endif
1865:   if (to->use_alltoallv) {
1867:     PetscMalloc2(size,PetscMPIInt,&to->counts,size,PetscMPIInt,&to->displs);
1868:     PetscMemzero(to->counts,size*sizeof(PetscMPIInt));
1869:     for (i=0; i<to->n; i++) {
1870:       to->counts[to->procs[i]] = bs*(to->starts[i+1] - to->starts[i]);
1871:     }
1872:     to->displs[0] = 0;
1873:     for (i=1; i<size; i++) {
1874:       to->displs[i] = to->displs[i-1] + to->counts[i-1];
1875:     }
1877:     PetscMalloc2(size,PetscMPIInt,&from->counts,size,PetscMPIInt,&from->displs);
1878:     PetscMemzero(from->counts,size*sizeof(PetscMPIInt));
1879:     for (i=0; i<from->n; i++) {
1880:       from->counts[from->procs[i]] = bs*(from->starts[i+1] - from->starts[i]);
1881:     }
1882:     from->displs[0] = 0;
1883:     for (i=1; i<size; i++) {
1884:       from->displs[i] = from->displs[i-1] + from->counts[i-1];
1885:     }
1886: #if defined(PETSC_HAVE_MPI_ALLTOALLW) && !defined(PETSC_USE_64BIT_INDICES)
1887:     if (to->use_alltoallw) {
1888:       PetscMPIInt mpibs = PetscMPIIntCast(bs), mpilen;
1889:       ctx->packtogether = PETSC_FALSE;
1890:       PetscMalloc3(size,PetscMPIInt,&to->wcounts,size,PetscMPIInt,&to->wdispls,size,MPI_Datatype,&to->types);
1891:       PetscMemzero(to->wcounts,size*sizeof(PetscMPIInt));
1892:       PetscMemzero(to->wdispls,size*sizeof(PetscMPIInt));
1893:       for (i=0; i<size; i++) {
1894:         to->types[i] = MPIU_SCALAR;
1895:       }
1897:       for (i=0; i<to->n; i++) {
1898:         to->wcounts[to->procs[i]] = 1;
1899:         mpilen = PetscMPIIntCast(to->starts[i+1]-to->starts[i]);
1900:         MPI_Type_create_indexed_block(mpilen,mpibs,to->indices+to->starts[i],MPIU_SCALAR,to->types+to->procs[i]);
1901:         MPI_Type_commit(to->types+to->procs[i]);
1902:       }
1903:       PetscMalloc3(size,PetscMPIInt,&from->wcounts,size,PetscMPIInt,&from->wdispls,size,MPI_Datatype,&from->types);
1904:       PetscMemzero(from->wcounts,size*sizeof(PetscMPIInt));
1905:       PetscMemzero(from->wdispls,size*sizeof(PetscMPIInt));
1906:       for (i=0; i<size; i++) {
1907:         from->types[i] = MPIU_SCALAR;
1908:       }
1909:       if (from->contiq) {
1910:         PetscInfo(ctx,"Scattered vector entries are stored contiquously, taking advantage of this with -vecscatter_alltoall\n");
1911:         for (i=0; i<from->n; i++) {
1912:           from->wcounts[from->procs[i]] = bs*(from->starts[i+1] - from->starts[i]);
1913:         }
1914:         if (from->n) from->wdispls[from->procs[0]] = sizeof(PetscScalar)*from->indices[0];
1915:         for (i=1; i<from->n; i++) {
1916:           from->wdispls[from->procs[i]] = from->wdispls[from->procs[i-1]] + sizeof(PetscScalar)*from->wcounts[from->procs[i-1]];
1917:         }
1918:       } else {
1919:         for (i=0; i<from->n; i++) {
1920:           from->wcounts[from->procs[i]] = 1;
1921:           mpilen = PetscMPIIntCast(from->starts[i+1]-from->starts[i]);
1922:           MPI_Type_create_indexed_block(mpilen,mpibs,from->indices+from->starts[i],MPIU_SCALAR,from->types+from->procs[i]);
1923:           MPI_Type_commit(from->types+from->procs[i]);
1924:         }
1925:       }
1926:     } else {
1927:       ctx->copy = VecScatterCopy_PtoP_AllToAll;
1928:     }
1929: #else 
1930:     to->use_alltoallw   = PETSC_FALSE;
1931:     from->use_alltoallw = PETSC_FALSE;
1932:     ctx->copy           = VecScatterCopy_PtoP_AllToAll;
1933: #endif
1934: #if defined(PETSC_HAVE_MPI_WIN_CREATE)
1935:   } else if (to->use_window) {
1936:     PetscMPIInt temptag,winsize;
1937:     MPI_Request *request;
1938:     MPI_Status  *status;
1939: 
1940:     PetscObjectGetNewTag((PetscObject)ctx,&temptag);
1941:     winsize = (to->n ? to->starts[to->n] : 0)*bs*sizeof(PetscScalar);
1942:     MPI_Win_create(to->values ? to->values : MPI_BOTTOM,winsize,sizeof(PetscScalar),MPI_INFO_NULL,comm,&to->window);
1943:     PetscMalloc(to->n*sizeof(PetscInt),&to->winstarts);
1944:     PetscMalloc2(to->n,MPI_Request,&request,to->n,MPI_Status,&status);
1945:     for (i=0; i<to->n; i++) {
1946:       MPI_Irecv(to->winstarts+i,1,MPIU_INT,to->procs[i],temptag,comm,request+i);
1947:     }
1948:     for (i=0; i<from->n; i++) {
1949:       MPI_Send(from->starts+i,1,MPIU_INT,from->procs[i],temptag,comm);
1950:     }
1951:     MPI_Waitall(to->n,request,status);
1952:     PetscFree2(request,status);
1954:     winsize = (from->n ? from->starts[from->n] : 0)*bs*sizeof(PetscScalar);
1955:     MPI_Win_create(from->values ? from->values : MPI_BOTTOM,winsize,sizeof(PetscScalar),MPI_INFO_NULL,comm,&from->window);
1956:     PetscMalloc(from->n*sizeof(PetscInt),&from->winstarts);
1957:     PetscMalloc2(from->n,MPI_Request,&request,from->n,MPI_Status,&status);
1958:     for (i=0; i<from->n; i++) {
1959:       MPI_Irecv(from->winstarts+i,1,MPIU_INT,from->procs[i],temptag,comm,request+i);
1960:     }
1961:     for (i=0; i<to->n; i++) {
1962:       MPI_Send(to->starts+i,1,MPIU_INT,to->procs[i],temptag,comm);
1963:     }
1964:     MPI_Waitall(from->n,request,status);
1965:     PetscFree2(request,status);
1966: #endif
1967:   } else {
1968:     PetscBool   use_rsend = PETSC_FALSE, use_ssend = PETSC_FALSE;
1969:     PetscInt    *sstarts = to->starts,  *rstarts = from->starts;
1970:     PetscMPIInt *sprocs  = to->procs,   *rprocs  = from->procs;
1971:     MPI_Request *swaits  = to->requests,*rwaits  = from->requests;
1972:     MPI_Request *rev_swaits,*rev_rwaits;
1973:     PetscScalar *Ssvalues = to->values, *Srvalues = from->values;
1975:     /* allocate additional wait variables for the "reverse" scatter */
1976:     PetscMalloc(to->n*sizeof(MPI_Request),&rev_rwaits);
1977:     PetscMalloc(from->n*sizeof(MPI_Request),&rev_swaits);
1978:     to->rev_requests   = rev_rwaits;
1979:     from->rev_requests = rev_swaits;
1981:     /* Register the receives that you will use later (sends for scatter reverse) */
1982:     PetscOptionsGetBool(PETSC_NULL,"-vecscatter_rsend",&use_rsend,PETSC_NULL);
1983:     PetscOptionsGetBool(PETSC_NULL,"-vecscatter_ssend",&use_ssend,PETSC_NULL);
1984:     if (use_rsend) {
1985:       PetscInfo(ctx,"Using VecScatter ready receiver mode\n");
1986:       to->use_readyreceiver    = PETSC_TRUE;
1987:       from->use_readyreceiver  = PETSC_TRUE;
1988:     } else {
1989:       to->use_readyreceiver    = PETSC_FALSE;
1990:       from->use_readyreceiver  = PETSC_FALSE;
1991:     }
1992:     if (use_ssend) {
1993:       PetscInfo(ctx,"Using VecScatter Ssend mode\n");
1994:     }
1996:     for (i=0; i<from->n; i++) {
1997:       if (use_rsend) {
1998:         MPI_Rsend_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tagr,comm,rev_swaits+i);
1999:       } else if (use_ssend) {
2000:         MPI_Ssend_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tagr,comm,rev_swaits+i);
2001:       } else {
2002:         MPI_Send_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tagr,comm,rev_swaits+i);
2003:       }
2004:     }
2006:     for (i=0; i<to->n; i++) {
2007:       if (use_rsend) {
2008:         MPI_Rsend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
2009:       } else if (use_ssend) {
2010:         MPI_Ssend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
2011:       } else {
2012:         MPI_Send_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
2013:       }
2014:     }
2015:     /* Register receives for scatter and reverse */
2016:     for (i=0; i<from->n; i++) {
2017:       MPI_Recv_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
2018:     }
2019:     for (i=0; i<to->n; i++) {
2020:       MPI_Recv_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tagr,comm,rev_rwaits+i);
2021:     }
2022:     if (use_rsend) {
2023:       if (to->n)   {MPI_Startall_irecv(to->starts[to->n]*to->bs,to->n,to->rev_requests);}
2024:       if (from->n) {MPI_Startall_irecv(from->starts[from->n]*from->bs,from->n,from->requests);}
2025:       MPI_Barrier(comm);
2026:     }
2028:     ctx->copy      = VecScatterCopy_PtoP_X;
2029:   }
2030:   PetscInfo1(ctx,"Using blocksize %D scatter\n",bs);
2031:   switch (bs) {
2032:   case 12:
2033:     ctx->begin     = VecScatterBegin_12;
2034:     ctx->end       = VecScatterEnd_12;
2035:     break;
2036:   case 8:
2037:     ctx->begin     = VecScatterBegin_8;
2038:     ctx->end       = VecScatterEnd_8;
2039:     break;
2040:   case 7:
2041:     ctx->begin     = VecScatterBegin_7;
2042:     ctx->end       = VecScatterEnd_7;
2043:     break;
2044:   case 6:
2045:     ctx->begin     = VecScatterBegin_6;
2046:     ctx->end       = VecScatterEnd_6;
2047:     break;
2048:   case 5:
2049:     ctx->begin     = VecScatterBegin_5;
2050:     ctx->end       = VecScatterEnd_5;
2051:     break;
2052:   case 4:
2053:     ctx->begin     = VecScatterBegin_4;
2054:     ctx->end       = VecScatterEnd_4;
2055:     break;
2056:   case 3:
2057:     ctx->begin     = VecScatterBegin_3;
2058:     ctx->end       = VecScatterEnd_3;
2059:     break;
2060:   case 2:
2061:     ctx->begin     = VecScatterBegin_2;
2062:     ctx->end       = VecScatterEnd_2;
2063:     break;
2064:   case 1:
2065:     ctx->begin     = VecScatterBegin_1;
2066:     ctx->end       = VecScatterEnd_1;
2067:     break;
2068:   default:
2069:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Blocksize not supported");
2070:   }
2071:   ctx->view      = VecScatterView_MPI;
2072:   /* Check if the local scatter is actually a copy; important special case */
2073:   if (to->local.n) {
2074:     VecScatterLocalOptimizeCopy_Private(ctx,&to->local,&from->local,bs);
2075:   }
2076:   return(0);
2077: }
2081: /* ------------------------------------------------------------------------------------*/
2082: /*
2083:          Scatter from local Seq vectors to a parallel vector. 
2084:          Reverses the order of the arguments, calls VecScatterCreate_PtoS() then
2085:          reverses the result.
2086: */
2089: PetscErrorCode VecScatterCreate_StoP(PetscInt nx,const PetscInt *inidx,PetscInt ny,const PetscInt *inidy,Vec xin,Vec yin,PetscInt bs,VecScatter ctx)
2090: {
2091:   PetscErrorCode         ierr;
2092:   MPI_Request            *waits;
2093:   VecScatter_MPI_General *to,*from;
2096:   VecScatterCreate_PtoS(ny,inidy,nx,inidx,yin,xin,bs,ctx);
2097:   to            = (VecScatter_MPI_General*)ctx->fromdata;
2098:   from          = (VecScatter_MPI_General*)ctx->todata;
2099:   ctx->todata   = (void*)to;
2100:   ctx->fromdata = (void*)from;
2101:   /* these two are special, they are ALWAYS stored in to struct */
2102:   to->sstatus   = from->sstatus;
2103:   to->rstatus   = from->rstatus;
2105:   from->sstatus = 0;
2106:   from->rstatus = 0;
2108:   waits              = from->rev_requests;
2109:   from->rev_requests = from->requests;
2110:   from->requests     = waits;
2111:   waits              = to->rev_requests;
2112:   to->rev_requests   = to->requests;
2113:   to->requests       = waits;
2114:   return(0);
2115: }
2117: /* ---------------------------------------------------------------------------------*/
2120: PetscErrorCode VecScatterCreate_PtoP(PetscInt nx,const PetscInt *inidx,PetscInt ny,const PetscInt *inidy,Vec xin,Vec yin,PetscInt bs,VecScatter ctx)
2121: {
2123:   PetscMPIInt    size,rank,tag,imdex,n;
2124:   PetscInt       *owners = xin->map->range;
2125:   PetscMPIInt    *nprocs = PETSC_NULL;
2126:   PetscInt       i,j,idx,nsends,*local_inidx = PETSC_NULL,*local_inidy = PETSC_NULL;
2127:   PetscInt       *owner = PETSC_NULL,*starts = PETSC_NULL,count,slen;
2128:   PetscInt       *rvalues = PETSC_NULL,*svalues = PETSC_NULL,base,*values = PETSC_NULL,*rsvalues,recvtotal,lastidx;
2129:   PetscMPIInt    *onodes1,*olengths1,nrecvs;
2130:   MPI_Comm       comm;
2131:   MPI_Request    *send_waits = PETSC_NULL,*recv_waits = PETSC_NULL;
2132:   MPI_Status     recv_status,*send_status = PETSC_NULL;
2133:   PetscBool      duplicate = PETSC_FALSE;
2134: #if defined(PETSC_USE_DEBUG)
2135:   PetscBool      found = PETSC_FALSE;
2136: #endif
2139:   PetscObjectGetNewTag((PetscObject)ctx,&tag);
2140:   PetscObjectGetComm((PetscObject)xin,&comm);
2141:   MPI_Comm_size(comm,&size);
2142:   MPI_Comm_rank(comm,&rank);
2143:   if (size == 1) {
2144:     VecScatterCreate_StoP(nx,inidx,ny,inidy,xin,yin,bs,ctx);
2145:     return(0);
2146:   }
2148:   /*
2149:      Each processor ships off its inidx[j] and inidy[j] to the appropriate processor
2150:      They then call the StoPScatterCreate()
2151:   */
2152:   /*  first count number of contributors to each processor */
2153:   PetscMalloc3(size,PetscMPIInt,&nprocs,nx,PetscInt,&owner,(size+1),PetscInt,&starts);
2154:   PetscMemzero(nprocs,size*sizeof(PetscMPIInt));
2155:   lastidx = -1;
2156:   j       = 0;
2157:   for (i=0; i<nx; i++) {
2158:     /* if indices are NOT locally sorted, need to start search at the beginning */
2159:     if (lastidx > (idx = bs*inidx[i])) j = 0;
2160:     lastidx = idx;
2161:     for (; j<size; j++) {
2162:       if (idx >= owners[j] && idx < owners[j+1]) {
2163:         nprocs[j]++;
2164:         owner[i] = j;
2165: #if defined(PETSC_USE_DEBUG)
2166:         found = PETSC_TRUE;
2167: #endif
2168:         break;
2169:       }
2170:     }
2171: #if defined(PETSC_USE_DEBUG)
2172:     if (!found) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index %D out of range",idx);
2173:     found = PETSC_FALSE;
2174: #endif
2175:   }
2176:   nsends = 0;  for (i=0; i<size; i++) { nsends += (nprocs[i] > 0);}
2178:   /* inform other processors of number of messages and max length*/
2179:   PetscGatherNumberOfMessages(comm,PETSC_NULL,nprocs,&nrecvs);
2180:   PetscGatherMessageLengths(comm,nsends,nrecvs,nprocs,&onodes1,&olengths1);
2181:   PetscSortMPIIntWithArray(nrecvs,onodes1,olengths1);
2182:   recvtotal = 0; for (i=0; i<nrecvs; i++) recvtotal += olengths1[i];
2184:   /* post receives:   */
2185:   PetscMalloc5(2*recvtotal,PetscInt,&rvalues,2*nx,PetscInt,&svalues,nrecvs,MPI_Request,&recv_waits,nsends,MPI_Request,&send_waits,nsends,MPI_Status,&send_status);
2187:   count = 0;
2188:   for (i=0; i<nrecvs; i++) {
2189:     MPI_Irecv((rvalues+2*count),2*olengths1[i],MPIU_INT,onodes1[i],tag,comm,recv_waits+i);
2190:     count += olengths1[i];
2191:   }
2192:   PetscFree(onodes1);
2194:   /* do sends:
2195:       1) starts[i] gives the starting index in svalues for stuff going to 
2196:          the ith processor
2197:   */
2198:   starts[0]= 0;
2199:   for (i=1; i<size; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
2200:   for (i=0; i<nx; i++) {
2201:     svalues[2*starts[owner[i]]]       = bs*inidx[i];
2202:     svalues[1 + 2*starts[owner[i]]++] = bs*inidy[i];
2203:   }
2205:   starts[0] = 0;
2206:   for (i=1; i<size+1; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
2207:   count = 0;
2208:   for (i=0; i<size; i++) {
2209:     if (nprocs[i]) {
2210:       MPI_Isend(svalues+2*starts[i],2*nprocs[i],MPIU_INT,i,tag,comm,send_waits+count);
2211:       count++;
2212:     }
2213:   }
2214:   PetscFree3(nprocs,owner,starts);
2216:   /*  wait on receives */
2217:   count = nrecvs;
2218:   slen  = 0;
2219:   while (count) {
2220:     MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
2221:     /* unpack receives into our local space */
2222:     MPI_Get_count(&recv_status,MPIU_INT,&n);
2223:     slen += n/2;
2224:     count--;
2225:   }
2226:   if (slen != recvtotal) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Total message lengths %D not as expected %D",slen,recvtotal);
2227: 
2228:   PetscMalloc2(slen,PetscInt,&local_inidx,slen,PetscInt,&local_inidy);
2229:   base  = owners[rank];
2230:   count = 0;
2231:   rsvalues = rvalues;
2232:   for (i=0; i<nrecvs; i++) {
2233:     values = rsvalues;
2234:     rsvalues += 2*olengths1[i];
2235:     for (j=0; j<olengths1[i]; j++) {
2236:       local_inidx[count]   = values[2*j] - base;
2237:       local_inidy[count++] = values[2*j+1];
2238:     }
2239:   }
2240:   PetscFree(olengths1);
2242:   /* wait on sends */
2243:   if (nsends) {MPI_Waitall(nsends,send_waits,send_status);}
2244:   PetscFree5(rvalues,svalues,recv_waits,send_waits,send_status);
2246:   /*
2247:      should sort and remove duplicates from local_inidx,local_inidy 
2248:   */
2250: #if defined(do_it_slow)
2251:   /* sort on the from index */
2252:   PetscSortIntWithArray(slen,local_inidx,local_inidy);
2253:   start = 0;
2254:   while (start < slen) {
2255:     count = start+1;
2256:     last  = local_inidx[start];
2257:     while (count < slen && last == local_inidx[count]) count++;
2258:     if (count > start + 1) { /* found 2 or more same local_inidx[] in a row */
2259:       /* sort on to index */
2260:       PetscSortInt(count-start,local_inidy+start);
2261:     }
2262:     /* remove duplicates; not most efficient way, but probably good enough */
2263:     i = start;
2264:     while (i < count-1) {
2265:       if (local_inidy[i] != local_inidy[i+1]) {
2266:         i++;
2267:       } else { /* found a duplicate */
2268:         duplicate = PETSC_TRUE;
2269:         for (j=i; j<slen-1; j++) {
2270:           local_inidx[j] = local_inidx[j+1];
2271:           local_inidy[j] = local_inidy[j+1];
2272:         }
2273:         slen--;
2274:         count--;
2275:       }
2276:     }
2277:     start = count;
2278:   }
2279: #endif
2280:   if (duplicate) {
2281:     PetscInfo(ctx,"Duplicate from to indices passed in VecScatterCreate(), they are ignored\n");
2282:   }
2283:   VecScatterCreate_StoP(slen,local_inidx,slen,local_inidy,xin,yin,bs,ctx);
2284:   PetscFree2(local_inidx,local_inidy);
2285:   return(0);
2286: }