revert some things, my mental health is at an all time low

This commit is contained in:
Afonso Franco 2023-10-19 21:19:56 +01:00
parent 66dc92b5f4
commit fbfb16c650
Signed by: afonso
SSH key fingerprint: SHA256:JiuxZNdA5bRWXPMUJChI0AQ75yC+cXY4xM0IaVwEVys
5 changed files with 108 additions and 93 deletions

View file

@ -6,7 +6,10 @@ ts := $(shell /usr/bin/date "+%d-%m__%H_%M_%S")
.DEFAULT_GOAL = MD
MD: $(SRC)/MD.cpp
$(CC) $(CFLAGS) $(SRC)MD.cpp -lm -march=native -mtune=native -mavx -O2 -ftree-vectorize -funroll-loops -pg -g -o ./out/MD
$(CC) $(CFLAGS) $(SRC)MD.cpp -lm -march=native -mavx -O2 -ftree-vectorize -funroll-loops -pg -g -o ./out/MD
MD-errors: $(SRC)/MD.cpp
$(CC) $(CFLAGS) $(SRC)MD.cpp -lm -march=native -mtune=native -mavx -O2 -fopt-info-vec-all -ftree-vectorizer-verbose=2 -ftree-vectorize -funroll-loops -pg -g -o ./out/MD
MDorig: $(SRC)/MD-original.cpp
$(CC) $(CFLAGS) $(SRC)MD-original.cpp -lm -O2 -pg -o ./out/MD-original

BIN
out/MD

Binary file not shown.

BIN
out/MD-orig Executable file

Binary file not shown.

View file

@ -540,7 +540,7 @@ void computeAccelerations() {
}
// returns sum of dv/dt*m/A (aka Pressure) from elastic collisions with walls
double VelocityVerlet(double dt, int iter, double *PE, FILE *fp) {
double VelocityVerlet(double dt, int iter, FILE *fp) {
int i, j, k;
double psum = 0.;

View file

@ -32,6 +32,25 @@
#include <stdlib.h>
#include <string.h>
// Readability
typedef __m256d fourD;
typedef __m128d twoD;
#define load_4d(ptr) _mm256_load_pd(ptr)
#define store_4d(ptr, val) _mm256_store_pd(ptr, val)
#define store_lower_1d(ptr, val) _mm_store_pd(ptr, val)
#define add_4d(a, b) _mm256_add_pd(a, b)
#define add_2d(a, b) _mm_add_pd(a, b)
#define hadd_4d(a, b) _mm256_hadd_pd(a, b)
#define sub_4d(a, b) _mm256_sub_pd(a, b)
#define mul_4d(a, b) _mm256_mul_pd(a, b)
#define div_4d(a, b) _mm256_div_pd(a, b)
#define create_4d(a) _mm256_set1_pd(a)
#define get_lower_2d(a) _mm256_castpd256_pd128(a)
#define get_upper_2d(a) _mm256_extractf128_pd(a, 1)
#define ALIGNMENT 32
// Number of particles
int N;
@ -55,11 +74,8 @@ double Tinit; // 2;
const int MAXPART = 5001;
// Position
double r[MAXPART][3];
// Velocity
double v[MAXPART][3];
// Acceleration
double a[MAXPART][3];
// Force
double F[MAXPART][3];
// atom type
@ -394,6 +410,7 @@ void initialize() {
int n, p, i, j, k;
double pos;
// Number of atoms in each direction
n = int(ceil(pow(N, 1.0 / 3)));
@ -489,96 +506,91 @@ void transposeMatrix2(double matT[MAXPART][3], double mat[3][MAXPART]) {
}
}
double PotentialAndAccelerationSIMD(double dt) {
memset(a, 0, sizeof(a));
double Pot = 0.;
for (int i = 0; i < N; i++) {
__m256d riMinusRj[4];
memset(riMinusRj, 0, sizeof(riMinusRj));
double dist[4];
memset(dist, 0, sizeof(dist));
for (int j = i + 1; j < N; j += 4) {
for (int k = 0; k < 4; k++) {
double quot, rnorm, term1, term2;
// CALCULATE POTENTIAL ENERGY
// double dist = 0.;
// double posItoJ[3]; // position of i relative to j
// POT
// double distTmp = r[i][k] - r[j][k];
// dist += distTmp * distTmp;
// ACCEL
// posItoJ[k] = distTmp;
// r[i]
__m256d ri = _mm256_loadu_pd(r[i]);
// r[j]
__m256d rj = _mm256_loadu_pd(r[j + k]);
// r[i] - r[j]
__m256d rij = _mm256_sub_pd(ri, rj);
riMinusRj[k] = rij;
// riMinusRj * riMinusRj
__m256d distTmp = _mm256_mul_pd(rij, rij);
dist[k] = distTmp[0] + distTmp[1] + distTmp[2];
}
__m256d dists = _mm256_loadu_pd(dist);
__m256d quot = _mm256_div_pd(_mm256_set1_pd(sigma * sigma), dists);
__m256d term2 = _mm256_mul_pd(quot, _mm256_mul_pd(quot, quot));
__m256d Pots =
_mm256_mul_pd(_mm256_set1_pd(epsilon_8),
_mm256_mul_pd(term2, _mm256_sub_pd(term2, _mm256_set1_pd(1.))));
Pot += Pots[0] + Pots[1] + Pots[2] + Pots[3];
// quot = sigma * sigma / dist;
// term2 = quot * quot * quot;
// Pot += epsilon_8 * term2 * (term2 - 1.);
__m256d distSqd = _mm256_mul_pd(dists, _mm256_mul_pd(dists, dists));
__m256d rSqd_inv7 = _mm256_mul_pd(distSqd, _mm256_mul_pd(distSqd, dists));
__m256d f8 = _mm256_div_pd(
_mm256_sub_pd(_mm256_set1_pd(48.), _mm256_mul_pd(_mm256_set1_pd(24.), distSqd)),
rSqd_inv7);
// // From derivative of Lennard-Jones with sigma and epsilon
// // set equal to 1 in natural units!
// double distSqd = dist * dist * dist;
// double rSqd_inv7 = distSqd * distSqd * dist;
// double f = (48. - (24. * distSqd)) / rSqd_inv7;
// Go back to the original loop
for (int k = 0; k < 4; k++) {
__m256d tmp = _mm256_mul_pd(riMinusRj[k], _mm256_set1_pd(f8[k]));
__m256d aI = _mm256_loadu_pd(a[i]);
__m256d aJ = _mm256_loadu_pd(a[i]);
_mm256_storeu_pd(a[i], _mm256_add_pd(aI, tmp));
_mm256_storeu_pd(a[j], _mm256_sub_pd(aJ, tmp));
// // from F = ma, where m = 1 in natural units!
// for (int k = 0; k < 3; k++) {
// double tmp = posItoJ[k] * f;
// a[i][k] += tmp;
// a[j][k] -= tmp;
// }
//
// }
// for (int j = 0; j < 3; j++) {
// v[i][j] += 0.5 * a[i][j] * dt;
// }
}
}
}
//}
return Pot;
}
// double PotentialAndAccelerationSIMD(double dt) {
// for (int i = 0; i < N; i++) {
// for (int j = 0; j < 3; j++)
// a[i][j] = 0.;
// }
// double Pot = 0.;
//
// for (int i = 0; i < N; i++) {
//
// fourD rij4[4];
// memset(rij4, 0, sizeof(rij4));
// double *dist = (double *)_mm_malloc(4 * sizeof(double), ALIGNMENT);
// memset(dist, 0, sizeof(double) * 4);
// //__m256i mask =
// //_mm256_setr_epi64x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0);
//
// for (int j = i + 1; j < N; j += 4) {
// fourD ri = load_4d(r[i]);
// int numParts = fmin(4, N - j);
// for (int k = 0; k < numParts; k++) {
// if (j + k >= N) {
// break;
// }
// double quot, rnorm, term1, term2;
//
// fourD rj = load_4d(r[j + k]);
// fourD rij = sub_4d(ri, rj);
// fourD distTmp = mul_4d(rij, rij);
// rij4[k] = rij;
//
// fourD temp = hadd_4d(distTmp, distTmp);
// twoD lo = get_lower_2d(temp);
// twoD hi = get_upper_2d(temp);
// twoD sum = add_2d(lo, hi);
//
// // Convert the scalar result to a double
// double finalSum;
// store_lower_1d(&dist[k], sum);
// }
// fourD dists = load_4d(dist);
// fourD quot = div_4d(create_4d(sigma * sigma), dists);
// fourD term2 = mul_4d(quot, mul_4d(quot, quot));
// fourD Pots = mul_4d(create_4d(epsilon_8), mul_4d(term2, sub_4d(term2, create_4d(1.))));
// // Perform horizontal addition and store the result in a scalar
// fourD temp = hadd_4d(Pots, Pots);
// twoD lo = get_lower_2d(temp);
// twoD hi = get_upper_2d(temp);
// twoD sum = add_2d(lo, hi);
//
// // Convert the scalar result to a double
// double finalSum;
// store_lower_1d(&finalSum, sum);
// Pot += finalSum;
//
// fourD distSqd = mul_4d(dists, mul_4d(dists, dists));
// fourD rSqd_inv7 = mul_4d(distSqd, mul_4d(distSqd, dists));
// fourD f8 = div_4d(sub_4d(create_4d(48.), mul_4d(create_4d(24.), distSqd)), rSqd_inv7);
//
// // Go back to the original loop
// for (int k = 0; k < numParts; k++) {
// if (j + k >= N) {
// break;
// }
// fourD tmp = mul_4d(rij4[k], create_4d(f8[k]));
// fourD aI = load_4d(a[i]);
// fourD aJ = load_4d(a[j + k]);
// fourD sum = add_4d(aI, tmp);
// fourD sub = sub_4d(aJ, tmp);
// store_4d(a[i], sum);
// store_4d(a[j + k], sub);
// }
// }
// }
// //}
//
// return Pot;
// }
double PotentialAndAcceleration(double dt) {
memset(a, 0, sizeof(a));
double Pot = 0.;
for (int i = 0; i < N; i++) {
for (int i = 0; i < N-1; i++) {
double ri[3] = {r[i][0], r[i][1], r[i][2]};
for (int j = i + 1; j < N; j++) {
double quot, rnorm, term1, term2;
// CALCULATE POTENTIAL ENERGY
@ -587,7 +599,7 @@ double PotentialAndAcceleration(double dt) {
for (int k = 0; k < 3; k++) {
// POT
double distTmp = r[i][k] - r[j][k];
double distTmp = ri[k] - r[j][k];
dist += distTmp * distTmp;
// ACCEL
posItoJ[k] = distTmp;
@ -597,7 +609,7 @@ double PotentialAndAcceleration(double dt) {
double distSqd = dist * dist * dist;
double rSqd_inv7 = distSqd * distSqd * dist;
double f = (48. - (24. * distSqd)) / rSqd_inv7;
// from F = ma, where m = 1 in natural units!
// from F = ma, where m = 1 in natural units!
for (int k = 0; k < 3; k++) {
double tmp = posItoJ[k] * f;
a[i][k] += tmp;
@ -646,8 +658,8 @@ void computeAccelerations() {
int i, j, k;
double f, rSqd, tmp = 0.;
double rij[3]; // position of i relative to j
memset(a, 0, sizeof(a));
memset(a, 0, sizeof(a));
for (i = 0; i < N - 1; i++) { // loop over all distinct pairs i,j
for (j = i + 1; j < N; j++) {
// initialize r^2 to zero