My Project
FPGASolverBackend.hpp
1 /*
2  Copyright 2020 Equinor ASA
3 
4  This file is part of the Open Porous Media project (OPM).
5 
6  OPM is free software: you can redistribute it and/or modify
7  it under the terms of the GNU General Public License as published by
8  the Free Software Foundation, either version 3 of the License, or
9  (at your option) any later version.
10 
11  OPM is distributed in the hope that it will be useful,
12  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  GNU General Public License for more details.
15 
16  You should have received a copy of the GNU General Public License
17  along with OPM. If not, see <http://www.gnu.org/licenses/>.
18 */
19 
20 #ifndef OPM_FPGASOLVER_BACKEND_HEADER_INCLUDED
21 #define OPM_FPGASOLVER_BACKEND_HEADER_INCLUDED
22 
23 #include <opm/simulators/linalg/bda/BdaSolver.hpp>
24 #include <opm/simulators/linalg/bda/FPGABILU0.hpp>
25 
26 #include <linearalgebra/ilu0bicgstab/xilinx/src/sda_app/bicgstab_solver_config.hpp>
27 #include <linearalgebra/ilu0bicgstab/xilinx/src/sda_app/common/opencl_lib.hpp>
28 #include <linearalgebra/ilu0bicgstab/xilinx/src/sda_app/common/fpga_functions_bicgstab.hpp>
29 
30 namespace bda
31 {
32 
// This class implements an ilu0-bicgstab solver on FPGA.
// It derives from BdaSolver<block_size> and overrides solve_system() and get_result().
// NOTE(review): `Base` and `Preconditioner` are used below, but their declarations are not
// visible in this listing (the embedded listing numbers skip 37-38) — confirm against the
// original header before editing.
34 template <unsigned int block_size>
35 class FpgaSolverBackend : public BdaSolver<block_size>
36 {
39 
 // Bring the members of the dependent base class into scope so they can be named
 // without a `this->` / `Base::` qualifier inside this class template.
40  using Base::N;
41  using Base::Nb;
42  using Base::nnz;
43  using Base::nnzb;
44  using Base::verbosity;
45  using Base::maxit;
46  using Base::tolerance;
47  using Base::initialized;
48 
49 private:
50  double *rx = nullptr; // reordered x
51  double *rb = nullptr; // reordered b
 // NOTE(review): presumably index maps between the original and the reordered row order — confirm
52  int *fromOrder = nullptr, *toOrder = nullptr;
53  bool analysis_done = false;
54  bool level_scheduling = false;
55 
56  // LUMat will shallow copy rowPointers and colIndices of mat/rMat
 // `mat` is owned through unique_ptr; `rMat` is a raw pointer.
 // NOTE(review): ownership of rMat is not visible in this header — confirm
57  std::unique_ptr<BlockedMatrix<block_size> > mat = nullptr;
58  BlockedMatrix<block_size> *rMat = nullptr;
59  std::unique_ptr<Preconditioner> prec = nullptr;
60 
61  // vectors with data processed by the preconditioner (input to the kernel)
62  void **processedPointers = nullptr;
63  int *processedSizes = nullptr;
64 
65  unsigned int fpga_calls = 0;
66  bool perf_call_enabled = true;
67 
68  // per call performance metrics
 // Every member has a default initializer, so a default-constructed instance starts zeroed.
 // NOTE(review): the `s_` prefix presumably denotes a duration in seconds and `n_` a count — confirm
69  typedef struct {
70  double s_preconditioner_create = 0.0;
71  double s_analysis = 0.0;
72  double s_reorder = 0.0;
73  double s_mem_setup = 0.0;
74  double s_mem_h2d = 0.0;
75  double s_kernel_exec = 0.0;
76  unsigned int n_kernel_exec_cycles = 0;
77  float n_kernel_exec_iters = 0.0;
78  double s_mem_d2h = 0.0;
79  double s_solve = 0.0;
80  double s_postprocess = 0.0;
81  bool converged = false;
82  unsigned int converged_flags = 0;
83  } perf_call_metrics_t;
84  // cumulative performance metrics
 // For most quantities this holds a total plus min/max/avg companions.
 // NOTE(review): unlike perf_call_metrics_t, no member here has a default initializer, so
 // the values are indeterminate until assigned — confirm the implementation sets them.
85  typedef struct {
86  double s_initialization;
87  double s_preconditioner_setup;
88  double s_preconditioner_create;
89  double s_preconditioner_create_min,s_preconditioner_create_max,s_preconditioner_create_avg;
90  double s_analysis;
91  double s_analysis_min,s_analysis_max,s_analysis_avg;
92  double s_reorder;
93  double s_reorder_min,s_reorder_max,s_reorder_avg;
94  double s_mem_setup;
95  double s_mem_setup_min,s_mem_setup_max,s_mem_setup_avg;
96  double s_mem_h2d;
97  double s_mem_h2d_min,s_mem_h2d_max,s_mem_h2d_avg;
98  double s_kernel_exec;
99  double s_kernel_exec_min,s_kernel_exec_max,s_kernel_exec_avg;
100  unsigned long n_kernel_exec_cycles;
101  unsigned long n_kernel_exec_cycles_min,n_kernel_exec_cycles_max,n_kernel_exec_cycles_avg;
102  float n_kernel_exec_iters;
103  float n_kernel_exec_iters_min,n_kernel_exec_iters_max,n_kernel_exec_iters_avg;
104  double s_mem_d2h;
105  double s_mem_d2h_min,s_mem_d2h_max,s_mem_d2h_avg;
106  double s_solve;
107  double s_solve_min,s_solve_max,s_solve_avg;
108  double s_postprocess;
109  double s_postprocess_min,s_postprocess_max,s_postprocess_avg;
110  unsigned int n_converged;
111  } perf_total_metrics_t;
 // One per-call record container plus a single cumulative record.
112  std::vector<perf_call_metrics_t> perf_call;
113  perf_total_metrics_t perf_total;
114  // fpga_config_bits: bit0=do_reset_debug: if 1, will reset debug flags at each state change, otherwise flags are sticky
115  // fpga_config_bits: bit1=absolute_compare: if 1, will compare norm with provided precision value, otherwise it's incremental
116  unsigned int fpga_config_bits = 0;
117  bool fpga_disabled = false;
 // NOTE(review): the members below without an initializer (platform_awsf1, debugbufferSize,
 // resultsNum, kernel_* flags, norms, ...) are indeterminate until assigned elsewhere — confirm
118  bool platform_awsf1;
119  unsigned int debugbufferSize;
120  unsigned long int *debugBuffer = nullptr;
121  unsigned int *databufferSize = nullptr;
122  unsigned char *dataBuffer[RW_BUF] = {nullptr};
123  unsigned int debug_outbuf_words;
124  int resultsNum;
125  int resultsBufferNum;
126  unsigned int resultsBufferSize[RES_BUF_MAX];
127  unsigned int result_offsets[6];
128  unsigned int kernel_cycles, kernel_iter_run;
129  double norms[4];
130  unsigned char last_norm_idx;
131  bool kernel_aborted, kernel_signature, kernel_overflow;
132  bool kernel_noresults;
133  bool kernel_wrafterend, kernel_dbgfifofull;
134  bool use_residuals = false;
135  bool use_LU_res = false;
136  int sequence = 0;
137  // TODO: these values may be sent via command line parameters
138  unsigned int abort_cycles = 2000000000; // 2x10^9 @ 300MHz is around 6.6 s
139  unsigned int debug_sample_rate = 65535; // max value allowed is 65535, 0 means disabled; reduce to get a finer debug dump
140  int nnzValArrays_size = 0;
141  int L_nnzValArrays_size = 0;
142  int U_nnzValArrays_size = 0;
143  // aliases to areas of the host data buffers
 // Three parallel groups of arrays follow: unprefixed, `L_`-prefixed and `U_`-prefixed.
 // NOTE(review): presumably the system matrix and the L/U factors of the ILU0 decomposition — confirm
144  long unsigned int *setupArray = nullptr;
145  double **nnzValArrays = nullptr;
146  short unsigned int *columnIndexArray = nullptr;
147  unsigned char *newRowOffsetArray = nullptr;
148  unsigned int *PIndexArray = nullptr;
149  unsigned int *colorSizesArray = nullptr;
150  double **L_nnzValArrays = nullptr;
151  short unsigned int *L_columnIndexArray = nullptr;
152  unsigned char *L_newRowOffsetArray = nullptr;
153  unsigned int *L_PIndexArray = nullptr;
154  unsigned int *L_colorSizesArray = nullptr;
155  double **U_nnzValArrays = nullptr;
156  short unsigned int *U_columnIndexArray = nullptr;
157  unsigned char *U_newRowOffsetArray = nullptr;
158  unsigned int *U_PIndexArray = nullptr;
159  unsigned int *U_colorSizesArray = nullptr;
160  double *BLKDArray = nullptr;
161  double *X1Array = nullptr, *X2Array = nullptr;
162  double *R1Array = nullptr, *R2Array = nullptr;
163  double *LresArray = nullptr, *UresArray = nullptr;
164  double *resultsBuffer[RES_BUF_MAX] = {nullptr}; // alias for data output region
165  // OpenCL variables
 // NOTE(review): device_id, context, commands, program and kernel have no initializer here — confirm they are set before use
166  cl_device_id device_id;
167  cl_context context;
168  cl_command_queue commands;
169  cl_program program;
170  cl_kernel kernel;
171  cl_mem cldata[RW_BUF] = {nullptr};
172  cl_mem cldebug = nullptr;
173  // HW limits/configuration variables
 // None of these carry a default initializer in this header.
174  unsigned int hw_x_vector_elem;
175  unsigned int hw_max_row_size;
176  unsigned int hw_max_column_size;
177  unsigned int hw_max_colors_size;
178  unsigned short hw_max_nnzs_per_row;
179  unsigned int hw_max_matrix_size;
180  bool hw_use_uram;
181  bool hw_write_ilu0_results;
182  unsigned short hw_dma_data_width;
183  unsigned char hw_x_vector_latency;
184  unsigned char hw_add_latency;
185  unsigned char hw_mult_latency;
186  unsigned char hw_mult_num;
187  unsigned char hw_num_read_ports;
188  unsigned char hw_num_write_ports;
189  unsigned short hw_reset_cycles;
190  unsigned short hw_reset_settle;
191  // debug
192  bool reset_data_buffers = false;
193  bool fill_results_buffers = false;
194  int dump_data_buffers = 0; // 0=disabled, 1=binary format, 2=text format
195  bool dump_results = false;
196  char *data_dir = nullptr;
197  char *basename = nullptr;
198  unsigned short rst_assert_cycles = 0;
199  unsigned short rst_settle_cycles = 0;
200 
 // Private helper; only declared in this header.
 // NOTE(review): parameters presumably describe the sparse matrix (sizes plus value/row/column
 // arrays), mirroring the leading parameters of the public solve_system() — confirm
208  void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols);
209 
 // Private helper taking a value array and a `b` array.
 // NOTE(review): presumably refreshes matrix values and right-hand side between solves — confirm
213  void update_system(double *vals, double *b);
214 
 // Private helper returning bool; the meaning of the return value is not visible here — TODO confirm
217  bool analyse_matrix();
218 
 // Private helper returning bool; the meaning of the return value is not visible here — TODO confirm
221  bool create_preconditioner();
222 
 // Private overload of solve_system() that takes only the result object.
225  void solve_system(BdaResult &res);
226 
 // Private helper; NOTE(review): presumably fills perf_total from perf_call — confirm
228  void generate_statistics(void);
229 
230 public:
231 
 // Construct an fpgaSolver from a bitstream string, verbosity level, iteration limit,
 // tolerance and an ILUReorder selection.
238  FpgaSolverBackend(std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance, ILUReorder opencl_ilu_reorder);
239 
 // NOTE(review): the reference text after this listing mentions ~FpgaSolverBackend(), but no
 // destructor declaration is visible here (the embedded listing numbers skip 241) — confirm
242 
 // Public entry point overriding the base-class solve_system(); returns a SolverStatus and
 // receives the result object by reference.
254  SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;
255 
 // Get result after linear solve, and perform postprocessing if necessary.
258  void get_result(double *x) override;
259 
260 }; // end class FpgaSolverBackend
261 
262 } //namespace bda
263 
264 #endif
265 
This class serves to eliminate the need to include the WellContributions into the matrix (with --matri...
Definition: WellContributions.hpp:61
This class is based on InverseOperatorResult struct from dune/istl/solver.hh It is needed to prevent ...
Definition: BdaResult.hpp:29
This class serves to simplify choosing between different backend solvers, such as cusparseSolver and ...
Definition: BdaSolver.hpp:43
This struct resembles a blocked csr matrix, like Dune::BCRSMatrix.
Definition: BlockedMatrix.hpp:36
Definition: FPGABILU0.hpp:39
This class implements an ilu0-bicgstab solver on FPGA.
Definition: FPGASolverBackend.hpp:36
FpgaSolverBackend(std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance, ILUReorder opencl_ilu_reorder)
Construct an fpgaSolver.
Definition: FPGASolverBackend.cpp:48
void get_result(double *x) override
Get result after linear solve, and perform postprocessing if necessary.
Definition: FPGASolverBackend.cpp:208
~FpgaSolverBackend()
Destroy an fpgaSolver, and free memory.
Definition: FPGASolverBackend.cpp:174