[Bf-blender-cvs] [3f9b692] master: Fluids: improve multithreaded CPU usage.

Sun Oct 2 16:39:39 CEST 2016

Commit: 3f9b69287d287af09c801a5203be07287128b2a0
Author: Brecht Van Lommel
Date:   Sun Oct 2 15:43:02 2016 +0200
Branches: master
https://developer.blender.org/rB3f9b69287d287af09c801a5203be07287128b2a0

Fluids: improve multithreaded CPU usage.

Fixes for clang-omp: use fewer shared variables and fix some cases of threads
writing to the same memory location. Issue found by Jens Verwiebe, who reports
a 30% speedup on a 16-core CPU when using this with a recent clang-omp version.

===================================================================

M	intern/elbeem/intern/solver_class.h
M	intern/elbeem/intern/solver_main.cpp

===================================================================

diff --git a/intern/elbeem/intern/solver_class.h b/intern/elbeem/intern/solver_class.h
index 593fea1..2b2e214 100644
--- a/intern/elbeem/intern/solver_class.h
+++ b/intern/elbeem/intern/solver_class.h
@@ -332,7 +332,7 @@ class LbmFsgrSolver :
 		void debugMarkCellCall(int level, int vi,int vj,int vk);
 		
 		// loop over grid, stream&collide update
-		void mainLoop(int lev);
+		void mainLoop(const int lev);
 		// change time step size
 		void adaptTimestep();
 		//! init mObjectSpeeds for current parametrization
diff --git a/intern/elbeem/intern/solver_main.cpp b/intern/elbeem/intern/solver_main.cpp
index 55a8d3e..a338bb7 100644
--- a/intern/elbeem/intern/solver_main.cpp
+++ b/intern/elbeem/intern/solver_main.cpp
@@ -355,7 +355,7 @@ void LbmFsgrSolver::fineAdvance()
 //! fine step function
 /*****************************************************************************/
 void 
-LbmFsgrSolver::mainLoop(int lev)
+LbmFsgrSolver::mainLoop(const int lev)
 {
 	// loops over _only inner_ cells  -----------------------------------------------------------------------------------
 	
@@ -376,13 +376,16 @@ LbmFsgrSolver::mainLoop(int lev)
   // main loop region
 	const bool doReduce = true;
 	const int gridLoopBound=1;
+	const int gDebugLevel = ::gDebugLevel;
+	int calcNumInvIfCells = 0;
+	LbmFloat calcInitialMass = 0;
 	GRID_REGION_INIT();
 #if PARALLEL==1
-#pragma omp parallel default(shared) num_threads(mNumOMPThreads) \
+#pragma omp parallel default(none) num_threads(mNumOMPThreads) \
   reduction(+: \
 	  calcCurrentMass,calcCurrentVolume, \
 		calcCellsFilled,calcCellsEmptied, \
-		calcNumUsedCells )
+		calcNumUsedCells,calcNumInvIfCells,calcInitialMass)
 	GRID_REGION_START();
 #else // PARALLEL==1
 	GRID_REGION_START();
@@ -468,7 +471,7 @@ LbmFsgrSolver::mainLoop(int lev)
 				calcCurrentMass += iniRho; 
 				calcCurrentVolume += 1.0; 
 				calcNumUsedCells++;
-				mInitialMass += iniRho;
+				calcInitialMass += iniRho;
 				// dont treat cell until next step
 				continue;
 			} 
@@ -479,7 +482,7 @@ LbmFsgrSolver::mainLoop(int lev)
 			if(isnotValid) {
 				// remove fluid cells, shouldnt be here anyway
 				LbmFloat fluidRho = m[0]; FORDF1 { fluidRho += m[l]; }
-				mInitialMass -= fluidRho;
+				calcInitialMass -= fluidRho;
 				const LbmFloat iniRho = 0.0;
 				RAC(tcel, dMass) = RAC(tcel, dFfrac) = iniRho;
 				RAC(tcel, dFlux) = FLUX_INIT;
@@ -608,8 +611,8 @@ LbmFsgrSolver::mainLoop(int lev)
 		// read distribution funtions of adjacent cells = stream step
 		DEFAULT_STREAM;
 
-		if((nbored & CFFluid)==0) { newFlag |= CFNoNbFluid; mNumInvIfCells++; }
-		if((nbored & CFEmpty)==0) { newFlag |= CFNoNbEmpty; mNumInvIfCells++; }
+		if((nbored & CFFluid)==0) { newFlag |= CFNoNbFluid; calcNumInvIfCells++; }
+		if((nbored & CFEmpty)==0) { newFlag |= CFNoNbEmpty; calcNumInvIfCells++; }
 
 		// calculate mass exchange for interface cells 
 		LbmFloat myfrac = RAC(ccel,dFfrac);
@@ -809,7 +812,7 @@ LbmFsgrSolver::mainLoop(int lev)
 			// fill if cells in inflow region
 			if(myfrac<0.5) { 
 				mass += 0.25; 
-				mInitialMass += 0.25;
+				calcInitialMass += 0.25;
 			}
 			const int OId = oldFlag>>24;
 			const LbmVec vel(mObjectSpeeds[OId]);
@@ -1013,7 +1016,7 @@ LbmFsgrSolver::mainLoop(int lev)
 		if( (mass) <= (rho * (   -FSGR_MAGICNR)) ) { ifemptied = 1; }
 
 		if(oldFlag & (CFMbndOutflow)) {
-			mInitialMass -= mass;
+			calcInitialMass -= mass;
 			mass = myfrac = 0.0;
 			iffilled = 0; ifemptied = 1;
 		}
@@ -1105,6 +1108,8 @@ LbmFsgrSolver::mainLoop(int lev)
 	mNumFilledCells  = calcCellsFilled;
 	mNumEmptiedCells = calcCellsEmptied;
 	mNumUsedCells = calcNumUsedCells;
+	mNumInvIfCells += calcNumInvIfCells;
+	mInitialMass += calcInitialMass;
 }
 
 
@@ -1115,13 +1120,14 @@ LbmFsgrSolver::preinitGrids()
 	const int lev = mMaxRefine;
 	const bool doReduce = false;
 	const int gridLoopBound=0;
+	const int gDebugLevel = ::gDebugLevel;
 
 	// preinit both grids
 	for(int s=0; s<2; s++) {
 	
 		GRID_REGION_INIT();
 #if PARALLEL==1
-#pragma omp parallel default(shared) num_threads(mNumOMPThreads) \
+#pragma omp parallel default(none) num_threads(mNumOMPThreads) \
   reduction(+: \
 	  calcCurrentMass,calcCurrentVolume, \
 		calcCellsFilled,calcCellsEmptied, \
@@ -1155,10 +1161,11 @@ LbmFsgrSolver::standingFluidPreinit()
 	const int lev = mMaxRefine;
 	const bool doReduce = false;
 	const int gridLoopBound=1;
+	const int gDebugLevel = ::gDebugLevel;
 
 	GRID_REGION_INIT();
 #if PARALLEL==1
-#pragma omp parallel default(shared) num_threads(mNumOMPThreads) \
+#pragma omp parallel default(none) num_threads(mNumOMPThreads) \
   reduction(+: \
 	  calcCurrentMass,calcCurrentVolume, \
 		calcCellsFilled,calcCellsEmptied, \