-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtransport.cu
1236 lines (1024 loc) · 45.7 KB
/
transport.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#include "header.h"
//#include <helper_cuda.h> //YU-modified
//#include <helper_string.h> //YU-modified
//#include <helper_math.h> //YU-modified
#include <float.h> //for FLT_MAX
#include <vector> // for vector
#include <assert.h>
#include <math.h>
// __device__ __constant__ unsigned long long num_photons_dc[1];
// __device__ __constant__ unsigned int n_layers_dc[1];
// __device__ __constant__ unsigned int num_detector_dc[1];
// __device__ __constant__ float start_weight_dc[1];
// __device__ __constant__ float detector_reflectance_dc[1];
// __device__ __constant__ LayerStruct layers_dc[PRESET_NUM_LAYER + 2];
// __device__ __constant__ DetectorInfoStruct detInfo_dc[PRESET_NUM_DETECTOR + 1];
// __device__ __constant__ float critical_angle_dc[PRESET_NUM_DETECTOR + 1];
// __device__ __constant__ bool source_probe_oblique_dc[1];
// __device__ __constant__ bool detector_probe_oblique_dc[1];
int InitMemStructs(MemStruct* HostMem, MemStruct* DeviceMem, SimulationStruct* sim, int num_of_threads);
int InitMemStructs_replay(MemStruct_Replay* HostMem, MemStruct_Replay* DeviceMem, SimulationStruct* sim, int num_of_threads);
void FreeMemStructs(MemStruct* HostMem, MemStruct* DeviceMem);
void FreeMemStructs_replay(MemStruct_Replay* HostMem, MemStruct_Replay* DeviceMem);
void FreeSimulationStruct(SimulationStruct* sim, int n_simulations);
__global__ void MCd(MemStruct DeviceMem, unsigned long long seed, int num_of_threads, int num_of_threads_per_block);
__global__ void MCd_replay(MemStruct DeviceMem, MemStruct_Replay DeviceMem_Replay, int detected_SDS, int num_of_threads_per_block);
int InitDCMem(SimulationStruct* sim);
int Write_Simulation_Results(MemStruct* HostMem, SimulationStruct* sim, clock_t simulation_time);
int read_simulation_data(char* filename, SimulationStruct** simulations, int ignoreAdetection);
int interpret_arg(int argc, char* argv[], unsigned long long* seed, int* ignoreAdetection);
__device__ void LaunchPhoton(PhotonStruct* p, curandState *state);
__global__ void LaunchPhoton_Global(MemStruct DeviceMem, unsigned long long seed, int num_of_threads_per_block);
__device__ void Spin(PhotonStruct*, float, curandState* state);
__device__ unsigned int Reflect(PhotonStruct*, int, curandState* state);
__device__ unsigned int PhotonSurvive(PhotonStruct*, curandState* state);
__device__ void AtomicAddULL(unsigned long long* address, unsigned int add);
__device__ bool detect(PhotonStruct* p, Fibers* f);
__device__ bool detect_replay(PhotonStruct* p, Fibers* f, int detected_SDS);
__device__ int binarySearch(float *data, float value);
void fiber_initialization(Fibers* f, int num_of_threads);
void fiber_initialization_replay(Fibers_Replay* f_r, SimulationStruct* sim, int num_of_threads);
void output_fiber(SimulationStruct* sim, float* reflectance, char* output); //Wang modified
void output_SDS_pathlength(SimulationStruct* simulation, float ***pathlength_weight_arr, int *temp_SDS_detect_num, int SDS_to_output, bool do_output_bin);
void output_sim_summary(SimulationStruct* simulation, SummaryStruct sumStruc, bool do_replay);
//void calculate_reflectance(Fibers* f, float *result, float (*pathlength_weight_arr)[NUM_LAYER + 1][detected_temp_size], int *total_SDS_detect_num, int *temp_SDS_detect_num);
void calculate_reflectance(Fibers* f, float *result, int* total_SDS_detect_num, vector<vector<curandState>>& detected_state_arr, int num_of_threads);
void calculate_reflectance_replay(Fibers_Replay* f_r, float *result, float ***pathlength_weight_arr, int *temp_SDS_detect_num, int *total_SDS_detect_num, int SDS_should_be, int num_layers, int num_of_threads);
void input_g(int index, G_Array *g);
int InitG(G_Array* HostG, G_Array* DeviceG, int index);
void FreeG(G_Array* HostG, G_Array* DeviceG);
void output_A_rz(SimulationStruct* sim, unsigned long long *data, bool do_output_bin);
void output_A0_z(SimulationStruct* sim, unsigned long long *data, bool do_output_bin);
void output_AA_rz(SimulationStruct* sim, unsigned long long *data, bool do_output_bin);
void output_AA0_z(SimulationStruct* sim, unsigned long long *data, bool do_output_bin);
void output_reflectance(SimulationStruct* sim, unsigned long long *data, bool do_output_bin);
void calculate_average_pathlength(double *average_PL, float ***pathlength_weight_arr, SimulationStruct *sim, int *total_SDS_detect_num);
void output_average_pathlength(SimulationStruct* sim, double *average_PL);
// extern __device__ __constant__ unsigned long long num_photons_dc[1];
// extern __device__ __constant__ unsigned int n_layers_dc[1];
// extern __device__ __constant__ unsigned int num_detector_dc[1];
// extern __device__ __constant__ float start_weight_dc[1];
// extern __device__ __constant__ float detector_reflectance_dc[1];
// extern __device__ __constant__ LayerStruct layers_dc[PRESET_NUM_LAYER + 2];
// extern __device__ __constant__ DetectorInfoStruct detInfo_dc[PRESET_NUM_DETECTOR + 1];
// extern __device__ __constant__ float critical_angle_dc[PRESET_NUM_DETECTOR + 1];
// extern __device__ __constant__ bool source_probe_oblique_dc[1];
// extern __device__ __constant__ bool detector_probe_oblique_dc[1];
__device__ float rn_gen(curandState *s)
{
float x = curand_uniform(s);
return x;
}
void DoOneSimulation(SimulationStruct* simulation, int index, char* output, SimOptions simOpt, GPUInfo** GPUs)
{
SummaryStruct sumStruc;
sumStruc.time1 = clock(); // tic
sumStruc.number_of_photons = simulation->number_of_photons;
strncpy(sumStruc.GPU_name, (*GPUs)[simOpt.GPU_select_to_run].name, max_GPU_name_length);
vector<vector<curandState>> detected_state_arr(simulation->num_detector); // the state for detected photon in curand
int *total_SDS_detect_num = new int[simulation->num_detector]; // record number fo detected photon by each detector
float *reflectance = new float[simulation->num_detector]; //record reflectance of fibers
for (int d = 0; d < simulation->num_detector; d++) {
total_SDS_detect_num[d] = 0;
reflectance[d] = 0;
}
unsigned long long seed = time(NULL);
srand(seed); // set random seed for main loop
int num_of_threads_per_block;
int num_of_blocks;
int num_of_threads;
if (USE_PRESET_GPU)
{
num_of_blocks = NUM_BLOCKS;
num_of_threads_per_block = NUM_THREADS_PER_BLOCK;
num_of_threads = NUM_THREADS;
}
else
{
num_of_blocks = (*GPUs)[simOpt.GPU_select_to_run].autoblock*(*GPUs)[simOpt.GPU_select_to_run].MPs;
num_of_threads_per_block = (*GPUs)[simOpt.GPU_select_to_run].autothreadpb;
num_of_threads = num_of_blocks*num_of_threads_per_block;
}
dim3 dimBlock(num_of_threads_per_block); printf("NUM_THREADS_PER_BLOCK\t%d\n", num_of_threads_per_block);
dim3 dimGrid(num_of_blocks); printf("NUM_BLOCKS\t%d\n", num_of_blocks);
MemStruct DeviceMem;
MemStruct HostMem;
unsigned int threads_active_total = 1;
unsigned int i, ii;
cudaError_t cudastat;
InitMemStructs(&HostMem, &DeviceMem, simulation, num_of_threads);
InitDCMem(simulation);
LaunchPhoton_Global << <dimGrid, dimBlock >> >(DeviceMem, seed, num_of_threads_per_block);
cudaThreadSynchronize(); //CUDA_SAFE_CALL( cudaThreadSynchronize() ); // Wait for all threads to finish
cudastat = cudaGetLastError(); // Check if there was an error
if (cudastat)printf("Error code=%i, %s.\n", cudastat, cudaGetErrorString(cudastat));
i = 0;
// run the first time to find the photons can be detected
while (threads_active_total>0)
{
i++;
fiber_initialization(HostMem.f, num_of_threads); //Wang modified
//printf("Size of Fibers\t%d\n",sizeof(Fibers));
cudaMemcpy(DeviceMem.f, HostMem.f, num_of_threads * sizeof(Fibers), cudaMemcpyHostToDevice); //malloc sizeof(FIbers) equals to 13*(5*4)
//run the kernel
seed = rand(); // get seed for MCD
MCd <<<dimGrid, dimBlock >>>(DeviceMem, seed, num_of_threads, num_of_threads_per_block);
//cout << "after MCd\n";
cudaThreadSynchronize(); //CUDA_SAFE_CALL( cudaThreadSynchronize() ); // Wait for all threads to finish
cudastat = cudaGetLastError(); // Check if there was an error
if (cudastat)printf("Error code=%i, %s.\n", cudastat, cudaGetErrorString(cudastat));
// Copy thread_active from device to host, later deleted
cudaMemcpy(HostMem.thread_active, DeviceMem.thread_active, num_of_threads * sizeof(unsigned int), cudaMemcpyDeviceToHost); //CUDA_SAFE_CALL(cudaMemcpy(HostMem.thread_active,DeviceMem.thread_active,NUM_THREADS*sizeof(unsigned int),cudaMemcpyDeviceToHost) );
threads_active_total = 0;
for (ii = 0; ii<num_of_threads; ii++) threads_active_total += HostMem.thread_active[ii];
cudaMemcpy(HostMem.f, DeviceMem.f, num_of_threads * sizeof(Fibers), cudaMemcpyDeviceToHost); //CUDA_SAFE_CALL(cudaMemcpy(HostMem.f,DeviceMem.f,NUM_THREADS*sizeof(Fibers),cudaMemcpyDeviceToHost));
//cout << "before cal ref\n";
calculate_reflectance(HostMem.f, reflectance, total_SDS_detect_num, detected_state_arr, num_of_threads);
//cout << "after cal ref\n";
cudaMemcpy(HostMem.num_terminated_photons, DeviceMem.num_terminated_photons, sizeof(unsigned long long), cudaMemcpyDeviceToHost);
printf("\rRun %u, Number of photons terminated %llu, Threads active %u, photon deteced number for SDSs:", i, *HostMem.num_terminated_photons, threads_active_total);
for (int d = 0; d < simulation->num_detector; d++) {
printf("\t%d,", total_SDS_detect_num[d]);
}
printf(" ");
//printf("\n");
}
cudaMemcpy(HostMem.AA_rz, DeviceMem.AA_rz, record_nr * record_nz * sizeof(unsigned long long), cudaMemcpyDeviceToHost);
cudaMemcpy(HostMem.AA0_z, DeviceMem.AA0_z, record_nz * sizeof(unsigned long long), cudaMemcpyDeviceToHost);
cudaMemcpy(HostMem.R, DeviceMem.R, sizeof(unsigned long long), cudaMemcpyDeviceToHost);
if (simOpt.do_output_AA_arr) {
output_AA_rz(simulation, HostMem.AA_rz, simOpt.do_output_bin); // output the absorbance
output_AA0_z(simulation, HostMem.AA0_z, simOpt.do_output_bin);
}
if (simOpt.do_output_reflectance){
output_reflectance(simulation,HostMem.R, simOpt.do_output_bin);
}
sumStruc.time2 = clock(); // toc
printf("\nfinish first run, cost %.2f secs\n", (double)(sumStruc.time2 - sumStruc.time1) / CLOCKS_PER_SEC);
sumStruc.total_SDS_detect_num = new int[simulation->num_detector];
for (int s = 0; s < simulation->num_detector; s++) {
sumStruc.total_SDS_detect_num[s] = total_SDS_detect_num[s];
}
if (!simOpt.do_replay) { // only output the reflectance
output_fiber(simulation, reflectance, output);
output_sim_summary(simulation, sumStruc, simOpt.do_replay);
}
else { // replay the detected photons
// init the memstruct for replay
MemStruct_Replay HostMem_Replay;
MemStruct_Replay DeviceMem_Replay;
InitMemStructs_replay(&HostMem_Replay, &DeviceMem_Replay, simulation, num_of_threads);
//cout << "after init replay mem\n";
int *temp_SDS_detect_num = new int[simulation->num_detector]; // record temp number fo detected photon by the detector
float *replay_reflectance = new float[simulation->num_detector]; //record reflectance of fibers
//float pathlength_weight_arr[PRESET_NUM_DETECTOR][detected_size][NUM_LAYER + 2]
float*** pathlength_weight_arr = new float**[simulation->num_detector]; // record the pathlength and weight for each photon, in each layer, and for each detector, also scatter times
for (int s = 0; s < simulation->num_detector; s++) {
temp_SDS_detect_num[s] = 0;
replay_reflectance[s] = 0;
// init the PL array for this detector
pathlength_weight_arr[s] = new float*[total_SDS_detect_num[s]]; // record the pathlength and weight for each photon, in each layer, and for each detector, also scatter times
for (int j = 0; j < total_SDS_detect_num[s]; j++) {
pathlength_weight_arr[s][j] = new float[simulation->num_layers + 2];
for (int k = 0; k < simulation->num_layers + 2; k++) {
pathlength_weight_arr[s][j][k] = 0;
}
}
//printf("\tafter init PL arr for SDS %d\n", s);
// prepare seeds to copy to device
int replay_counter = 0;
while (replay_counter < total_SDS_detect_num[s]) {
int to_dev_index = 0;
for (int t = 0; t < num_of_threads; t++) {
HostMem.thread_active[t] = 0;
}
while (to_dev_index < num_of_threads && replay_counter < total_SDS_detect_num[s]) {
HostMem.state[to_dev_index] = detected_state_arr[s][replay_counter];
HostMem.thread_active[to_dev_index] = 1;
replay_counter++;
to_dev_index++;
}
cudaMemcpy(DeviceMem.state, HostMem.state, num_of_threads * sizeof(curandState), cudaMemcpyHostToDevice);
cudaMemcpy(DeviceMem.thread_active, HostMem.thread_active, num_of_threads * sizeof(unsigned int), cudaMemcpyHostToDevice);
//printf("\t\tafter copy seeds to device mem\n");
// init fibers
fiber_initialization(HostMem.f, num_of_threads);
cudaMemcpy(DeviceMem.f, HostMem.f, num_of_threads * sizeof(Fibers), cudaMemcpyHostToDevice);
fiber_initialization_replay(HostMem_Replay.f_r, simulation, num_of_threads);
cudaMemcpy(DeviceMem_Replay.f_r, HostMem_Replay.f_r, num_of_threads * sizeof(Fibers_Replay), cudaMemcpyHostToDevice);
//printf("\t\tafter init fibers\n");
// replay
MCd_replay << <dimGrid, dimBlock >> > (DeviceMem, DeviceMem_Replay, s, num_of_threads_per_block);
cudaThreadSynchronize(); // Wait for all threads to finish
cudastat = cudaGetLastError(); // Check if there was an error
if (cudastat)printf("Error code=%i, %s.\n", cudastat, cudaGetErrorString(cudastat));
//printf("\t\tafter replay\n");
// process result
cudaMemcpy(HostMem_Replay.f_r, DeviceMem_Replay.f_r, num_of_threads * sizeof(Fibers_Replay), cudaMemcpyDeviceToHost);
calculate_reflectance_replay(HostMem_Replay.f_r, replay_reflectance, pathlength_weight_arr, temp_SDS_detect_num, total_SDS_detect_num, s + 1, simulation->num_layers, num_of_threads);
//printf("\t\tafter cal reflectance\n");
}
}
int *backup_SDS_detect_num = new int[simulation->num_detector];
for (int z = 0; z < simulation->num_detector; z++) {
backup_SDS_detect_num[z] = temp_SDS_detect_num[z];
}
if (simOpt.output_each_pathlength) {
output_SDS_pathlength(simulation, pathlength_weight_arr, temp_SDS_detect_num, 0, simOpt.do_output_bin);
}
if (simOpt.do_output_average_pathlength) {
double *average_PL = new double[simulation->num_layers * simulation->num_detector];
calculate_average_pathlength(average_PL, pathlength_weight_arr, simulation, backup_SDS_detect_num);
output_average_pathlength(simulation, average_PL);
}
output_fiber(simulation, replay_reflectance, output);
// copy the A_rz and A0_z back to host
cudaMemcpy(HostMem_Replay.A_rz, DeviceMem_Replay.A_rz, simulation->num_detector * record_nr * record_nz * sizeof(unsigned long long), cudaMemcpyDeviceToHost);
cudaMemcpy(HostMem_Replay.A0_z, DeviceMem_Replay.A0_z, simulation->num_detector * record_nz * sizeof(unsigned long long), cudaMemcpyDeviceToHost);
if (simOpt.do_output_A_arr) {
output_A_rz(simulation, HostMem_Replay.A_rz, simOpt.do_output_bin); // output the absorbance
output_A0_z(simulation, HostMem_Replay.A0_z, simOpt.do_output_bin);
}
sumStruc.time3 = clock();
printf("finish replay, cost %.2f secs\n", (double)(sumStruc.time3 - sumStruc.time2) / CLOCKS_PER_SEC);
output_sim_summary(simulation, sumStruc, simOpt.do_replay);
// free the memory
FreeMemStructs_replay(&HostMem_Replay, &DeviceMem_Replay);
delete[] temp_SDS_detect_num;
delete[] replay_reflectance;
for (int i = 0; i < simulation->num_detector; i++) {
for (int j = 0; j < total_SDS_detect_num[i]; j++) {
delete[] pathlength_weight_arr[i][j];
}
delete[] pathlength_weight_arr[i];
}
delete[] pathlength_weight_arr;
}
// free the memory
FreeMemStructs(&HostMem, &DeviceMem);
delete[] total_SDS_detect_num;
delete[] reflectance;
}
void calculate_average_pathlength(double *average_PL ,float ***pathlength_weight_arr, SimulationStruct *sim, int *total_SDS_detect_num)
{
clock_t time1 = clock(); // tic
double inverse_start_weight = double(1/sim->start_weight);
double sum_photon_reflectance = 0;
double sum_photon_change_reflectnace = 0;
double photon_energy = 0;
for (int s = 0; s < sim->num_detector; s++) {
for (int l = 1; l <= sim->num_layers; l++) {
sum_photon_reflectance = 0;
sum_photon_change_reflectnace = 0;
double changed_mua = mua_change_ratio * sim->layers[l].mua;
for (int i = 0; i < total_SDS_detect_num[s]; i++) {
photon_energy = pathlength_weight_arr[s][i][0] * inverse_start_weight;
sum_photon_reflectance += photon_energy;
photon_energy *= exp(-1 * changed_mua * pathlength_weight_arr[s][i][l]);
sum_photon_change_reflectnace += photon_energy;
}
average_PL[s * sim->num_layers + l - 1] = log(sum_photon_reflectance / sum_photon_change_reflectnace) / changed_mua;
}
}
clock_t time2 = clock(); // toc
printf("finish average pathlength, cost %.2f secs\n", (double)(time2 - time1) / CLOCKS_PER_SEC);
}
void calculate_reflectance(Fibers* f, float *result, int* total_SDS_detect_num, vector<vector<curandState>>& detected_state_arr, int num_of_threads)
{
for (int i = 0; i < num_of_threads; i++)
{
// record the weight, count detected photon number, and record pathlength
for (int k = 0; k < f[i].detected_photon_counter; k++) {
int s = f[i].detected_SDS_number[k]; // the detecting SDS, start from 1
result[s - 1] += f[i].data[k];
total_SDS_detect_num[s - 1]++;
detected_state_arr[s - 1].push_back(f[i].detected_state[k]);
}
}
}
//void calculate_reflectance(Fibers* f, float *result, float (*pathlength_weight_arr)[NUM_LAYER + 1][detected_temp_size], int *total_SDS_detect_num, int *temp_SDS_detect_num)
//SDS_should_be: the detecting SDS, start from s=1 for SDS1
void calculate_reflectance_replay(Fibers_Replay* f_r, float *result, float ***pathlength_weight_arr, int *temp_SDS_detect_num, int *total_SDS_detect_num, int SDS_should_be, int num_layers, int num_of_threads)
{
for (int i = 0; i < num_of_threads; i++)
{
// record the weight, count detected photon number, and record pathlength
result[SDS_should_be -1] += f_r[i].data;
pathlength_weight_arr[SDS_should_be - 1][temp_SDS_detect_num[SDS_should_be - 1]][0] = f_r[i].data;
for (int l = 0; l < num_layers; l++) {
pathlength_weight_arr[SDS_should_be - 1][temp_SDS_detect_num[SDS_should_be - 1]][l + 1] = f_r[i].layer_pathlength[l];
}
pathlength_weight_arr[SDS_should_be - 1][temp_SDS_detect_num[SDS_should_be - 1]][num_layers + 1] = f_r[i].scatter_event;
temp_SDS_detect_num[SDS_should_be - 1]++;
if (temp_SDS_detect_num[SDS_should_be - 1] >= total_SDS_detect_num[SDS_should_be - 1]) { // if all the photons are replayed, then break
break;
}
}
}
//Device function to add an unsigned integer to an unsigned long long using CUDA Compute Capability 1.1
__device__ void AtomicAddULL(unsigned long long* address, unsigned int add)
{
if (atomicAdd((unsigned int*)address, add) + add<add)
atomicAdd(((unsigned int*)address) + 1, 1u);
}
__global__ void MCd(MemStruct DeviceMem, unsigned long long seed, int num_of_threads, int num_of_threads_per_block)
{
//Block index
int bx = blockIdx.x;
//Thread index
int tx = threadIdx.x;
//First element processed by the block
int begin = num_of_threads_per_block * bx;
float s; //step length
unsigned int w_temp;
PhotonStruct p = DeviceMem.p[begin + tx];
Fibers f = DeviceMem.f[begin + tx];
int new_layer;
curandState state = p.state_run; // get the state of curand from the photon
//First, make sure the thread (photon) is active
unsigned int ii = 0;
if (!DeviceMem.thread_active[begin + tx]) ii = NUMSTEPS_GPU;
unsigned int index, w, index_old;
index_old = 0;
w = 0;
bool k = true;
for (; ii<NUMSTEPS_GPU; ii++) //this is the main while loop
{
if (layers_dc[p.layer].mutr != FLT_MAX)
s = -__logf(rn_gen(&state))*layers_dc[p.layer].mutr;//sample step length [cm] //HERE AN OPEN_OPEN FUNCTION WOULD BE APPRECIATED
else
s = 100.0f;//temporary, say the step in glass is 100 cm.
//Check for layer transitions and in case, calculate s
new_layer = p.layer;
if (p.z + s*p.dz<layers_dc[p.layer].z_min) {
new_layer--;
s = __fdividef(layers_dc[p.layer].z_min - p.z, p.dz);
} //Check for upwards reflection/transmission & calculate new s
if (p.z + s*p.dz>layers_dc[p.layer].z_max) {
new_layer++;
s = __fdividef(layers_dc[p.layer].z_max - p.z, p.dz);
} //Check for downward reflection/transmission
p.x += p.dx*s;
p.y += p.dy*s;
p.z += p.dz*s;
p.scatter_event++;
if (p.z>layers_dc[p.layer].z_max)p.z = layers_dc[p.layer].z_max;//needed?
if (p.z<layers_dc[p.layer].z_min)p.z = layers_dc[p.layer].z_min;//needed?
if (new_layer != p.layer)
{
int temp_layer = p.layer;
// set the remaining step length to 0
s = 0.0f;
if (Reflect(&p, new_layer, &state) == 0u)//Check for reflection
{
if (new_layer == 0)
{ //Diffuse reflectance
bool detected = detect(&p, &f);
if (detected == 1) {
AtomicAddULL(&DeviceMem.R[0], p.weight);
p.weight = 0; // Set the remaining weight to 0, effectively killing the photon
}
else { // maybe the detector will reflect the photon
if (rn_gen(&state) > *detector_reflectance_dc) {
AtomicAddULL(&DeviceMem.R[0], p.weight);
p.weight = 0;
}
else // reflect into tissue
{
p.dz *= -1;
p.z *= -1;
p.layer = temp_layer;
}
}
}
if (new_layer > *n_layers_dc)
{ //Transmitted
p.weight = 0; // Set the remaining weight to 0, effectively killing the photon
}
}
}
if (s > 0.0f)
{
// Drop weight (apparently only when the photon is scattered)
w_temp = __float2uint_rn(layers_dc[p.layer].mua*layers_dc[p.layer].mutr*__uint2float_rn(p.weight));
//w_temp = layers_dc[p.layer].mua*layers_dc[p.layer].mutr*p.weight;
p.weight -= w_temp;
// store the absorbed power to grid
if (p.first_scatter) {
unsigned int index;
index = min(__float2int_rz(__fdividef(p.z, record_dz)), (int)(record_nz - 1));
AtomicAddULL(&DeviceMem.AA0_z[index], w_temp);
p.first_scatter = false;
}
else {
index = min(__float2int_rz(__fdividef(p.z, record_dz)), (int)(record_nz - 1)) *record_nr + min(__float2int_rz(__fdividef(sqrtf(p.x*p.x + p.y*p.y), record_dr)), (int)record_nr - 1);
if (index == index_old)
{
w += w_temp;
}
else// if(w!=0)
{
AtomicAddULL(&DeviceMem.AA_rz[index_old], w);
index_old = index;
w = w_temp;
}
}
Spin(&p, layers_dc[p.layer].g, &state);
}
if (!PhotonSurvive(&p, &state)) //if the photon doesn't survive
{
k = false;
if (atomicAdd(DeviceMem.num_terminated_photons, 1u) < (*num_photons_dc - num_of_threads))
{ // Ok to launch another photon
LaunchPhoton(&p, &state);//Launch a new photon
state = p.state_run; // reload the state of this thread
}
else
{ // No more photons should be launched.
AtomicAddULL(&DeviceMem.AA_rz[index_old], w);
DeviceMem.thread_active[begin + tx] = 0u; // Set thread to inactive
ii = NUMSTEPS_GPU; // Exit main loop
}
}
}//end main for loop!
if (w != 0)
AtomicAddULL(&DeviceMem.AA_rz[index_old], w);
p.state_run = state; // store the current curand state into the photon
//// truly need?
//if (k == true && DeviceMem.thread_active[begin + tx] == 1u) // photons are not killed after numerous steps
//{
// if (*DeviceMem.num_terminated_photons >= (*num_photons_dc - NUM_THREADS))
// DeviceMem.thread_active[begin + tx] = 0u;
//}
__syncthreads();//necessary?
//save the state of the MC simulation in global memory before exiting
DeviceMem.p[begin + tx] = p; //This one is incoherent!!!
DeviceMem.f[begin + tx] = f;
}//end MCd
__global__ void MCd_replay(MemStruct DeviceMem, MemStruct_Replay DeviceMem_Replay, int detected_SDS, int num_of_threads_per_block)
{
//Block index
int bx = blockIdx.x;
//Thread index
int tx = threadIdx.x;
//First element processed by the block
int begin = num_of_threads_per_block * bx;
//First, make sure the thread (photon) is active
if (DeviceMem.thread_active[begin + tx]) {
float s; //step length
unsigned int index, w, index_old;
index_old = 0;
w = 0;
unsigned int w_temp;
PhotonStruct p = DeviceMem.p[begin + tx];
Fibers f = DeviceMem.f[begin + tx];
Fibers_Replay f_r = DeviceMem_Replay.f_r[begin + tx];
// load the seed of detected photon and launch it
curandState state = DeviceMem.state[begin + tx];
LaunchPhoton(&p, &state);
state = p.state_run; // get the state of curand from the photon
int new_layer;
bool k = true;
while (true) //this is the main while loop
{
if (layers_dc[p.layer].mutr != FLT_MAX)
s = -__logf(rn_gen(&state))*layers_dc[p.layer].mutr;//sample step length [cm] //HERE AN OPEN_OPEN FUNCTION WOULD BE APPRECIATED
else
s = 100.0f;//temporary, say the step in glass is 100 cm.
//Check for layer transitions and in case, calculate s
new_layer = p.layer;
if (p.z + s*p.dz < layers_dc[p.layer].z_min) {
new_layer--;
s = __fdividef(layers_dc[p.layer].z_min - p.z, p.dz);
} //Check for upwards reflection/transmission & calculate new s
if (p.z + s*p.dz > layers_dc[p.layer].z_max) {
new_layer++;
s = __fdividef(layers_dc[p.layer].z_max - p.z, p.dz);
} //Check for downward reflection/transmission
p.x += p.dx*s;
p.y += p.dy*s;
p.z += p.dz*s;
f_r.scatter_event++;
f_r.layer_pathlength[p.layer - 1] += s;
if (p.z > layers_dc[p.layer].z_max)p.z = layers_dc[p.layer].z_max;//needed?
if (p.z < layers_dc[p.layer].z_min)p.z = layers_dc[p.layer].z_min;//needed?
if (new_layer != p.layer)
{
int temp_layer = p.layer;
// set the remaining step length to 0
s = 0.0f;
if (Reflect(&p, new_layer, &state) == 0u)//Check for reflection
{
if (new_layer == 0)
{ //Diffuse reflectance
bool detected = detect_replay(&p, &f, detected_SDS);
if (detected) { // store the photon information into f_r
f_r.have_detected = true;
f_r.data = f.data[0];
f_r.detected_SDS_number = f.detected_SDS_number[0];
p.weight = 0; // Set the remaining weight to 0, effectively killing the photon
break;
}
else { // maybe the detector will reflect the photon
if (rn_gen(&state) > *detector_reflectance_dc) {
f_r.have_detected = true;
f_r.data = f.data[0];
f_r.detected_SDS_number = f.detected_SDS_number[0];
p.weight = 0;
break;
}
else
{
p.dz *= -1;
p.z *= -1;
p.layer = temp_layer;
}
}
}
if (new_layer > *n_layers_dc)
{ //Transmitted
p.weight = 0; // Set the remaining weight to 0, effectively killing the photon
break;
}
}
}
if (s > 0.0f)
{
// Drop weight (apparently only when the photon is scattered)
w_temp = __float2uint_rn(layers_dc[p.layer].mua*layers_dc[p.layer].mutr*__uint2float_rn(p.weight));
//w_temp = layers_dc[p.layer].mua*layers_dc[p.layer].mutr*p.weight;
p.weight -= w_temp;
// store the absorbed power to grid
if (p.first_scatter) {
unsigned int index;
index = detected_SDS*record_nz + min(__float2int_rz(__fdividef(p.z, record_dz)), (int)(record_nz - 1));
AtomicAddULL(&DeviceMem_Replay.A0_z[index], w_temp);
p.first_scatter = false;
}
else {
index = detected_SDS*record_nr*record_nz + min(__float2int_rz(__fdividef(p.z, record_dz)), (int)(record_nz - 1)) *record_nr + min(__float2int_rz(__fdividef(sqrtf(p.x*p.x + p.y*p.y), record_dr)), (int)record_nr - 1);
if (index == index_old)
{
w += w_temp;
}
else// if(w!=0)
{
AtomicAddULL(&DeviceMem_Replay.A_rz[index_old], w);
index_old = index;
w = w_temp;
}
}
Spin(&p, layers_dc[p.layer].g, &state);
}
if (!PhotonSurvive(&p, &state)) //if the photon doesn't survive
{
break;
}
}//end main for loop!
if (w != 0)
AtomicAddULL(&DeviceMem_Replay.A_rz[index_old], w);
p.state_run = state; // store the current rand state
//save the state of the MC simulation in global memory before exiting
DeviceMem.p[begin + tx] = p; //This one is incoherent!!!
DeviceMem.f[begin + tx] = f;
DeviceMem_Replay.f_r[begin + tx] = f_r;
}
}//end MCd
__device__ void LaunchPhoton(PhotonStruct* p, curandState *state)
{
p->state_seed = *state; // store the init curandState of photon
/*
float rnd_position, rnd_Azimuth, rnd_direction, rnd_rotated;
float AzimuthAngle;
float launchPosition;
float theta_direction;
float rotated_angle;
float uxprime, uyprime, uzprime;
float angle = -ANGLE * PI / 180;
*/
//rnd_position = rn_gen(state);
//rnd_Azimuth = rn_gen(state);
//rnd_direction = rn_gen(state);
//rnd_rotated = rn_gen(state);
//AzimuthAngle = 2 * PI * rnd_Azimuth;
//rotated_angle = 2 * PI * rnd_rotated; //YU-modified // modified to point source
//float beam_width = 0.0175; // 175 um, Gaussian beam profile
//float beam_width = illumination_r;
// infinite narrow beam for impulse response
//theta_direction = asin(NAOfSource / n_source)*rnd_direction;
//p->dz = cos(theta_direction);
//p->dx = sin(theta_direction) * cos(rotated_angle);
//p->dy = sin(theta_direction) * sin(rotated_angle);
//uxprime = cos(angle)*p->dx - sin(angle)*p->dz;
//uyprime = sin(theta_direction)*sin(rotated_angle);
//uzprime = sin(angle)*p->dx + cos(angle)*p->dz; // YU-modified
//p->dx = uxprime, p->dy = uyprime, p->dz = uzprime;
if (*source_probe_oblique_dc)
{
p->x = 0.0;
p->y = 0.0;
p->z = 0.0;
// the angle is toward positive x-axis
p->dx = 1.0*sin(detInfo_dc[0].angle);
p->dy = 0.0;
p->dz = 1.0*cos(detInfo_dc[0].angle);
}
else // if source_probe_oblique_dc == false
{
if (detInfo_dc[0].raduis == 0) { // for point source
p->x = 0.0;
p->y = 0.0;
p->z = 0.0;
}
else {
float rnd_position, rnd_Azimuth;
float launchPosition;
float AzimuthAngle;
rnd_position = rn_gen(state);
rnd_Azimuth = rn_gen(state);
launchPosition = detInfo_dc[0].raduis*rnd_position;
AzimuthAngle = 2 * PI * rnd_Azimuth;
p->x = launchPosition*sin(AzimuthAngle);
p->y = launchPosition*cos(AzimuthAngle);
p->z = 0;
}
if (detInfo_dc[0].NA == 0) { // for pencil beam
p->dx = 0.0, p->dy = 0.0, p->dz = 1.0;
}
else { // for NA source
float rnd_direction, rnd_rotated;
float theta_direction; // direction angle from +z
float rotated_angle; // the rotate of direction
rnd_direction = rn_gen(state);
rnd_rotated = rn_gen(state);
theta_direction = asin(detInfo_dc[0].NA / layers_dc[0].n)*rnd_direction;
rotated_angle = 2 * PI * rnd_rotated;
p->dz = cos(theta_direction);
p->dx = sin(theta_direction) * cos(rotated_angle);
p->dy = sin(theta_direction) * sin(rotated_angle);
}
}
p->layer = 1;
p->first_scatter = true;
p->scatter_event = 0;
p->weight = *start_weight_dc; //specular reflection!
p->state_run = *state; // store the current state of photon after serveral rand
}
__global__ void LaunchPhoton_Global(MemStruct DeviceMem, unsigned long long seed, int num_of_threads_per_block)
{
int bx = blockIdx.x;
int tx = threadIdx.x;
//First element processed by the block
int begin = num_of_threads_per_block*bx;
PhotonStruct p;
curandState state = DeviceMem.state[begin + tx];
curand_init(seed, begin+tx, 0, &state); // init curandState for each photon
LaunchPhoton(&p, &state);
//__syncthreads();//necessary?
DeviceMem.p[begin + tx] = p;//incoherent!?
}
__device__ void Spin(PhotonStruct* p, float g, curandState *state)
{
float theta, cost, sint; // cosine and sine of the polar deflection angle theta.
float cosp, sinp; // cosine and sine of the azimuthal angle psi.
float temp;
if (g == 0.0f)
{
cost = 2.0f*rn_gen(state) - 1.0f;
}
else // This is more efficient for g!=0 but of course less efficient for g==0
{
// The Henyey-Greenstein phase function
temp = __fdividef((1.0f - (g)*(g)), (1.0f - (g)+2.0f*(g)*rn_gen(state)));
cost = __fdividef((1.0f + (g)*(g)-temp*temp), (2.0f*(g)));
}
sint = sqrtf(1.0f - cost*cost);
__sincosf(2.0f*PI*rn_gen(state), &sinp, &cosp); // spin psi [0-2*PI)
temp = sqrtf(1.0f - p->dz*p->dz);
if (temp == 0.0f) //normal incident.
{
p->dx = sint*cosp;
p->dy = sint*sinp;
p->dz = copysignf(cost, p->dz*cost);
}
else // regular incident.
{
float cxp, cyp;
cxp = __fdividef(sint*(p->dx*p->dz*cosp - p->dy*sinp), temp) + p->dx*cost;
cyp = __fdividef(sint*(p->dy*p->dz*cosp + p->dx*sinp), temp) + p->dy*cost;
p->dx = cxp;
p->dy = cyp;
p->dz = -sint*cosp*temp + p->dz*cost;
}
//normalisation seems to be required as we are using floats! Otherwise the small numerical error will accumulate
temp = rsqrtf(p->dx*p->dx + p->dy*p->dy + p->dz*p->dz);
p->dx = p->dx*temp;
p->dy = p->dy*temp;
p->dz = p->dz*temp;
}// end Spin
__device__ unsigned int Reflect(PhotonStruct* p, int new_layer, curandState *state)
{
//Calculates whether the photon is reflected (returns 1) or not (returns 0)
// Reflect() will also update the current photon layer (after transmission) and photon direction (both transmission and reflection)
float n1 = layers_dc[p->layer].n;
float n2 = layers_dc[new_layer].n;
float r;
float cos_angle_i = fabsf(p->dz);
if (n1 == n2)//refraction index matching automatic transmission and no direction change
{
p->layer = new_layer;
return 0u;
}
if (n1>n2 && n2*n2<n1*n1*(1 - cos_angle_i*cos_angle_i))//total internal reflection, no layer change but z-direction mirroring
{
p->dz *= -1.0f;
return 1u;
}
if (cos_angle_i == 1.0f)//normal incident
{
r = __fdividef((n1 - n2), (n1 + n2));
if (rn_gen(state) <= r*r)
{
//reflection, no layer change but z-direction mirroring
p->dz *= -1.0f;
return 1u;
}
else
{ //transmission, no direction change but layer change
p->layer = new_layer;
return 0u;
}
}
//gives almost exactly the same results as the old MCML way of doing the calculation but does it slightly faster
// save a few multiplications, calculate cos_angle_i^2;
float e = __fdividef(n1*n1, n2*n2)*(1.0f - cos_angle_i*cos_angle_i); //e is the sin square of the transmission angle
r = 2 * sqrtf((1.0f - cos_angle_i*cos_angle_i)*(1.0f - e)*e*cos_angle_i*cos_angle_i);//use r as a temporary variable
e = e + (cos_angle_i*cos_angle_i)*(1.0f - 2.0f*e);//Update the value of e
r = e*__fdividef((1.0f - e - r), ((1.0f - e + r)*(e + r)));//Calculate r
if (rn_gen(state) <= r)
{
// Reflection, mirror z-direction!
p->dz *= -1.0f;
return 1u;
}
else
{
// Transmission, update layer and direction
r = __fdividef(n1, n2);
e = r*r*(1.0f - cos_angle_i*cos_angle_i); //e is the sin square of the transmission angle
p->dx *= r;
p->dy *= r;
p->dz = copysignf(sqrtf(1 - e), p->dz);
p->layer = new_layer;
return 0u;
}
}
__device__ unsigned int PhotonSurvive(PhotonStruct* p, curandState *state)
{
//Calculate wether the photon survives (returns 1) or dies (returns 0)
if (p->scatter_event >= max_scatter_time) return 0; // scatter too many times, terminate the photon
if (p->weight>WEIGHTI) return 1u; // No roulette needed
if (p->weight == 0u) return 0u; // Photon has exited slab, i.e. kill the photon
if (rn_gen(state) < CHANCE)
{
p->weight = __float2uint_rn(__fdividef((float)p->weight, CHANCE));
//p->weight = __fdividef((float)p->weight,CHANCE);
return 1u;
}
return 0u;
}
__device__ bool detect(PhotonStruct* p, Fibers* f)
{
bool detected_flag = false;
if (*detector_probe_oblique_dc)
{
for (int i = 1; i <= *num_detector_dc; i++)
{
// check if the photon is in the range of detector
// because the detector is oblique, so the detection area is oval
// use fober detector rathter than ring detector, and each detector locate at (x,y) = (pos,0)
//if ((pow((p->x - detInfo_dc[i].position)*cos(detInfo_dc[i].angle), 2) + pow(p->y, 2)) <= pow(detInfo_dc[i].raduis, 2))
if (((p->x - detInfo_dc[i].position)*cos(detInfo_dc[i].angle)*(p->x - detInfo_dc[i].position)*cos(detInfo_dc[i].angle))+((p->y)*(p->y)) <= detInfo_dc[i].raduis*detInfo_dc[i].raduis )
{
// let the detector point to the -x direction
float uz_rotated = (p->dx*sin(detInfo_dc[i].angle)) + fabs(p->dz*cos(detInfo_dc[i].angle));
float uz_angle = acos(fabs(uz_rotated));
if (uz_angle <= critical_angle_dc[i]) // in the range of detector NA
{
if (f->detected_photon_counter < SDS_detected_temp_size) {
f->detected_SDS_number[f->detected_photon_counter] = i;
f->data[f->detected_photon_counter] = p->weight;
f->detected_state[f->detected_photon_counter] = p->state_seed;
f->detected_photon_counter++;
}
detected_flag = true;
}
}
}
}
else // detector_probe_oblique_dc == false
{
float uz_angle = acos(fabs(p->dz)); //YU-modified
float distance;
distance = sqrt(p->x * p->x + p->y * p->y);
for (int i = 1; i <= *num_detector_dc; i++)
{
if (uz_angle <= critical_angle_dc[i]) { // successfully detected
if ((distance >= (detInfo_dc[i].position - detInfo_dc[i].raduis)) && (distance <= (detInfo_dc[i].position + detInfo_dc[i].raduis))) {
// calculate the weight detected by ring detector into a fiber detector
float temp;
temp = (distance*distance + detInfo_dc[i].position * detInfo_dc[i].position - detInfo_dc[i].raduis * detInfo_dc[i].raduis) / (2 * distance * detInfo_dc[i].position);
// check for rounding error!
if (temp > 1.0f)
temp = 1.0f;
if (f->detected_photon_counter < SDS_detected_temp_size) {
f->detected_SDS_number[f->detected_photon_counter] = i;
f->data[f->detected_photon_counter] = p->weight * acos(temp) * RPI;