// Ceres Solver - A fast non-linear least squares minimizer // Copyright 2019 Google Inc. All rights reserved. // http://ceres-solver.org/ // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // // * Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution. // * Neither the name of Google Inc. nor the names of its contributors may be // used to endorse or promote products derived from this software without // specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. 
//
// Author: sameeragarwal@google.com (Sameer Agarwal)

#ifndef CERES_INTERNAL_SCHUR_ELIMINATOR_H_
#define CERES_INTERNAL_SCHUR_ELIMINATOR_H_

#include <map>
#include <memory>
#include <mutex>
#include <vector>

#include "Eigen/Dense"
#include "ceres/block_random_access_matrix.h"
#include "ceres/block_sparse_matrix.h"
#include "ceres/block_structure.h"
#include "ceres/internal/config.h"
#include "ceres/internal/disable_warnings.h"
#include "ceres/internal/eigen.h"
#include "ceres/internal/export.h"
#include "ceres/linear_solver.h"

namespace ceres::internal {

// Classes implementing the SchurEliminatorBase interface implement
// variable elimination for linear least squares problems. Assuming
// that the input linear system Ax = b can be partitioned into
//
//  E y + F z = b
//
// Where x = [y;z] is a partition of the variables.  The partitioning
// of the variables is such that, E'E is a block diagonal matrix. Or
// in other words, the parameter blocks in E form an independent set
// of the graph implied by the block matrix A'A. Then, this class
// provides the functionality to compute the Schur complement system
//
//   S z = r
//
// where
//
//   S = F'F - F'E (E'E)^{-1} E'F and r = F'b - F'E(E'E)^(-1) E'b
//
// This is the Eliminate operation, i.e., construct the linear system
// obtained by eliminating the variables in E.
//
// The eliminator also provides the reverse functionality, i.e. given
// values for z it can back substitute for the values of y, by solving the
// linear system
//
//  Ey = b - F z
//
// which is done by observing that
//
//  y = (E'E)^(-1) [E'b - E'F z]
//
// The eliminator has a number of requirements.
//
// The rows of A are ordered so that for every variable block in y,
// all the rows containing that variable block occur as a vertically
// contiguous block.
// i.e., the matrix A looks like
//
//              E                 F                   chunk
//  A = [ y1   0   0   0 |  z1    0    0   0    z5]      1
//      [ y1   0   0   0 |  z1   z2    0   0     0]      1
//      [  0  y2   0   0 |   0    0   z3   0     0]      2
//      [  0   0  y3   0 |  z1   z2   z3  z4    z5]      3
//      [  0   0  y3   0 |  z1    0    0   0    z5]      3
//      [  0   0   0  y4 |   0    0    0   0    z5]      4
//      [  0   0   0  y4 |   0   z2    0   0     0]      4
//      [  0   0   0  y4 |   0    0    0   0     0]      4
//      [  0   0   0   0 |  z1    0    0   0     0] non chunk blocks
//      [  0   0   0   0 |   0    0   z3  z4    z5] non chunk blocks
//
// This structure should be reflected in the corresponding
// CompressedRowBlockStructure object associated with A. The linear
// system Ax = b should either be well posed or the array D below
// should be non-null and the diagonal matrix corresponding to it
// should be non-singular. For simplicity of exposition only the case
// with a null D is described.
//
// The usual way to do the elimination is as follows. Starting with
//
//  E y + F z = b
//
// we can form the normal equations,
//
//  E'E y + E'F z = E'b
//  F'E y + F'F z = F'b
//
// multiplying both sides of the first equation by (E'E)^(-1) and then
// by F'E we get
//
//  F'E y + F'E (E'E)^(-1) E'F z =  F'E (E'E)^(-1) E'b
//  F'E y +                F'F z =  F'b
//
// now subtracting the two equations we get
//
// [F'F - F'E (E'E)^(-1) E'F] z = F'b - F'E(E'E)^(-1) E'b
//
// Instead of forming the normal equations and operating on them as
// general sparse matrices, the algorithm here deals with one
// parameter block in y at a time. The rows corresponding to a single
// parameter block yi are known as a chunk, and the algorithm operates
// on one chunk at a time. The mathematics remains the same since the
// reduced linear system can be shown to be the sum of the reduced
// linear systems for each chunk. This can be seen by observing two
// things.
//
//  1. E'E is a block diagonal matrix.
//
//  2. When E'F is computed, only the terms within a single chunk
//  interact, i.e., for y1 column blocks when transposed and multiplied
//  with F, the only non-zero contribution comes from the blocks in
//  chunk 1.
// // Thus, the reduced linear system // // FF' - F'E (E'E)^(-1) E'F // // can be re-written as // // sum_k F_k F_k' - F_k'E_k (E_k'E_k)^(-1) E_k' F_k // // Where the sum is over chunks and E_k'E_k is dense matrix of size y1 // x y1. // // Advanced usage. Until now it has been assumed that the user would // be interested in all of the Schur Complement S. However, it is also // possible to use this eliminator to obtain an arbitrary submatrix of // the full Schur complement. When the eliminator is generating the // blocks of S, it asks the RandomAccessBlockMatrix instance passed to // it if it has storage for that block. If it does, the eliminator // computes/updates it, if not it is skipped. This is useful when one // is interested in constructing a preconditioner based on the Schur // Complement, e.g., computing the block diagonal of S so that it can // be used as a preconditioner for an Iterative Substructuring based // solver [See Agarwal et al, Bundle Adjustment in the Large, ECCV // 2008 for an example of such use]. // // Example usage: Please see schur_complement_solver.cc class CERES_NO_EXPORT SchurEliminatorBase { public: virtual ~SchurEliminatorBase(); // Initialize the eliminator. It is the user's responsibility to call // this function before calling Eliminate or BackSubstitute. It is // also the caller's responsibility to ensure that the // CompressedRowBlockStructure object passed to this method is the // same one (or is equivalent to) the one associated with the // BlockSparseMatrix objects below. // // assume_full_rank_ete controls how the eliminator inverts with the // diagonal blocks corresponding to e blocks in A'A. If // assume_full_rank_ete is true, then a Cholesky factorization is // used to compute the inverse, otherwise a singular value // decomposition is used to compute the pseudo inverse. 
virtual void Init(int num_eliminate_blocks, bool assume_full_rank_ete, const CompressedRowBlockStructure* bs) = 0; // Compute the Schur complement system from the augmented linear // least squares problem [A;D] x = [b;0]. The left hand side and the // right hand side of the reduced linear system are returned in lhs // and rhs respectively. // // It is the caller's responsibility to construct and initialize // lhs. Depending upon the structure of the lhs object passed here, // the full or a submatrix of the Schur complement will be computed. // // Since the Schur complement is a symmetric matrix, only the upper // triangular part of the Schur complement is computed. virtual void Eliminate(const BlockSparseMatrixData& A, const double* b, const double* D, BlockRandomAccessMatrix* lhs, double* rhs) = 0; // Given values for the variables z in the F block of A, solve for // the optimal values of the variables y corresponding to the E // block in A. virtual void BackSubstitute(const BlockSparseMatrixData& A, const double* b, const double* D, const double* z, double* y) = 0; // Factory static std::unique_ptr Create( const LinearSolver::Options& options); }; // Templated implementation of the SchurEliminatorBase interface. The // templating is on the sizes of the row, e and f blocks sizes in the // input matrix. In many problems, the sizes of one or more of these // blocks are constant, in that case, its worth passing these // parameters as template arguments so that they are visible to the // compiler and can be used for compile time optimization of the low // level linear algebra routines. 
template class CERES_NO_EXPORT SchurEliminator final : public SchurEliminatorBase { public: explicit SchurEliminator(const LinearSolver::Options& options) : num_threads_(options.num_threads), context_(options.context) { CHECK(context_ != nullptr); } // SchurEliminatorBase Interface ~SchurEliminator() override; void Init(int num_eliminate_blocks, bool assume_full_rank_ete, const CompressedRowBlockStructure* bs) final; void Eliminate(const BlockSparseMatrixData& A, const double* b, const double* D, BlockRandomAccessMatrix* lhs, double* rhs) final; void BackSubstitute(const BlockSparseMatrixData& A, const double* b, const double* D, const double* z, double* y) final; private: // Chunk objects store combinatorial information needed to // efficiently eliminate a whole chunk out of the least squares // problem. Consider the first chunk in the example matrix above. // // [ y1 0 0 0 | z1 0 0 0 z5] // [ y1 0 0 0 | z1 z2 0 0 0] // // One of the intermediate quantities that needs to be calculated is // for each row the product of the y block transposed with the // non-zero z block, and the sum of these blocks across rows. A // temporary array "buffer_" is used for computing and storing them // and the buffer_layout maps the indices of the z-blocks to // position in the buffer_ array. The size of the chunk is the // number of row blocks/residual blocks for the particular y block // being considered. // // For the example chunk shown above, // // size = 2 // // The entries of buffer_layout will be filled in the following order. 
// // buffer_layout[z1] = 0 // buffer_layout[z5] = y1 * z1 // buffer_layout[z2] = y1 * z1 + y1 * z5 using BufferLayoutType = std::map; struct Chunk { explicit Chunk(int start) : size(0), start(start) {} int size; int start; BufferLayoutType buffer_layout; }; void ChunkDiagonalBlockAndGradient( const Chunk& chunk, const BlockSparseMatrixData& A, const double* b, int row_block_counter, typename EigenTypes::Matrix* eet, double* g, double* buffer, BlockRandomAccessMatrix* lhs); void UpdateRhs(const Chunk& chunk, const BlockSparseMatrixData& A, const double* b, int row_block_counter, const double* inverse_ete_g, double* rhs); void ChunkOuterProduct(int thread_id, const CompressedRowBlockStructure* bs, const Matrix& inverse_eet, const double* buffer, const BufferLayoutType& buffer_layout, BlockRandomAccessMatrix* lhs); void EBlockRowOuterProduct(const BlockSparseMatrixData& A, int row_block_index, BlockRandomAccessMatrix* lhs); void NoEBlockRowsUpdate(const BlockSparseMatrixData& A, const double* b, int row_block_counter, BlockRandomAccessMatrix* lhs, double* rhs); void NoEBlockRowOuterProduct(const BlockSparseMatrixData& A, int row_block_index, BlockRandomAccessMatrix* lhs); int num_threads_; ContextImpl* context_; int num_eliminate_blocks_; bool assume_full_rank_ete_; // Block layout of the columns of the reduced linear system. Since // the f blocks can be of varying size, this vector stores the // position of each f block in the row/col of the reduced linear // system. Thus lhs_row_layout_[i] is the row/col position of the // i^th f block. std::vector lhs_row_layout_; // Combinatorial structure of the chunks in A. For more information // see the documentation of the Chunk object above. std::vector chunks_; // TODO(sameeragarwal): The following two arrays contain per-thread // storage. They should be refactored into a per thread struct. // Buffer to store the products of the y and z blocks generated // during the elimination phase. 
buffer_ is of size num_threads * // buffer_size_. Each thread accesses the chunk // // [thread_id * buffer_size_ , (thread_id + 1) * buffer_size_] // std::unique_ptr buffer_; // Buffer to store per thread matrix matrix products used by // ChunkOuterProduct. Like buffer_ it is of size num_threads * // buffer_size_. Each thread accesses the chunk // // [thread_id * buffer_size_ , (thread_id + 1) * buffer_size_ -1] // std::unique_ptr chunk_outer_product_buffer_; int buffer_size_; int uneliminated_row_begins_; // Locks for the blocks in the right hand side of the reduced linear // system. std::vector rhs_locks_; }; // SchurEliminatorForOneFBlock specializes the SchurEliminatorBase interface for // the case where there is exactly one f-block and all three parameters // kRowBlockSize, kEBlockSize and KFBlockSize are known at compile time. This is // the case for some two view bundle adjustment problems which have very // stringent latency requirements. // // Under these assumptions, we can simplify the more general algorithm // implemented by SchurEliminatorImpl significantly. Two of the major // contributors to the increased performance are: // // 1. Simpler loop structure and less use of dynamic memory. Almost everything // is on the stack and benefits from aligned memory as well as fixed sized // vectorization. We are also able to reason about temporaries and control // their lifetimes better. // 2. Use of inverse() over llt().solve(Identity). 
template class CERES_NO_EXPORT SchurEliminatorForOneFBlock final : public SchurEliminatorBase { public: // TODO(sameeragarwal) Find out why "assume_full_rank_ete" is not used here void Init(int num_eliminate_blocks, bool /*assume_full_rank_ete*/, const CompressedRowBlockStructure* bs) override { CHECK_GT(num_eliminate_blocks, 0) << "SchurComplementSolver cannot be initialized with " << "num_eliminate_blocks = 0."; CHECK_EQ(bs->cols.size() - num_eliminate_blocks, 1); num_eliminate_blocks_ = num_eliminate_blocks; const int num_row_blocks = bs->rows.size(); chunks_.clear(); int r = 0; // Iterate over the row blocks of A, and detect the chunks. The // matrix should already have been ordered so that all rows // containing the same y block are vertically contiguous. while (r < num_row_blocks) { const int e_block_id = bs->rows[r].cells.front().block_id; if (e_block_id >= num_eliminate_blocks_) { break; } chunks_.push_back(Chunk()); Chunk& chunk = chunks_.back(); chunk.num_rows = 0; chunk.start = r; // Add to the chunk until the first block in the row is // different than the one in the first row for the chunk. while (r + chunk.num_rows < num_row_blocks) { const CompressedRow& row = bs->rows[r + chunk.num_rows]; if (row.cells.front().block_id != e_block_id) { break; } ++chunk.num_rows; } r += chunk.num_rows; } const Chunk& last_chunk = chunks_.back(); uneliminated_row_begins_ = last_chunk.start + last_chunk.num_rows; e_t_e_inverse_matrices_.resize(kEBlockSize * kEBlockSize * chunks_.size()); std::fill( e_t_e_inverse_matrices_.begin(), e_t_e_inverse_matrices_.end(), 0.0); } void Eliminate(const BlockSparseMatrixData& A, const double* b, const double* D, BlockRandomAccessMatrix* lhs_bram, double* rhs_ptr) override { // Since there is only one f-block, we can call GetCell once, and cache its // output. 
int r, c, row_stride, col_stride; CellInfo* cell_info = lhs_bram->GetCell(0, 0, &r, &c, &row_stride, &col_stride); typename EigenTypes::MatrixRef lhs( cell_info->values, kFBlockSize, kFBlockSize); typename EigenTypes::VectorRef rhs(rhs_ptr, kFBlockSize); lhs.setZero(); rhs.setZero(); const CompressedRowBlockStructure* bs = A.block_structure(); const double* values = A.values(); // Add the diagonal to the Schur complement. if (D != nullptr) { typename EigenTypes::ConstVectorRef diag( D + bs->cols[num_eliminate_blocks_].position, kFBlockSize); lhs.diagonal() = diag.array().square().matrix(); } Eigen::Matrix tmp; Eigen::Matrix tmp2; // The following loop works on a block matrix which looks as follows // (number of rows can be anything): // // [e_1 | f_1] = [b1] // [e_2 | f_2] = [b2] // [e_3 | f_3] = [b3] // [e_4 | f_4] = [b4] // // and computes the following // // e_t_e = sum_i e_i^T * e_i // e_t_e_inverse = inverse(e_t_e) // e_t_f = sum_i e_i^T f_i // e_t_b = sum_i e_i^T b_i // f_t_b = sum_i f_i^T b_i // // lhs += sum_i f_i^T * f_i - e_t_f^T * e_t_e_inverse * e_t_f // rhs += f_t_b - e_t_f^T * e_t_e_inverse * e_t_b for (int i = 0; i < chunks_.size(); ++i) { const Chunk& chunk = chunks_[i]; const int e_block_id = bs->rows[chunk.start].cells.front().block_id; // Naming convention, e_t_e = e_block.transpose() * e_block; Eigen::Matrix e_t_e; Eigen::Matrix e_t_f; Eigen::Matrix e_t_b; Eigen::Matrix f_t_b; // Add the square of the diagonal to e_t_e. 
if (D != nullptr) { const typename EigenTypes::ConstVectorRef diag( D + bs->cols[e_block_id].position, kEBlockSize); e_t_e = diag.array().square().matrix().asDiagonal(); } else { e_t_e.setZero(); } e_t_f.setZero(); e_t_b.setZero(); f_t_b.setZero(); for (int j = 0; j < chunk.num_rows; ++j) { const int row_id = chunk.start + j; const auto& row = bs->rows[row_id]; const typename EigenTypes::ConstMatrixRef e_block(values + row.cells[0].position, kRowBlockSize, kEBlockSize); const typename EigenTypes::ConstVectorRef b_block( b + row.block.position, kRowBlockSize); e_t_e.noalias() += e_block.transpose() * e_block; e_t_b.noalias() += e_block.transpose() * b_block; if (row.cells.size() == 1) { // There is no f block, so there is nothing more to do. continue; } const typename EigenTypes::ConstMatrixRef f_block(values + row.cells[1].position, kRowBlockSize, kFBlockSize); e_t_f.noalias() += e_block.transpose() * f_block; lhs.noalias() += f_block.transpose() * f_block; f_t_b.noalias() += f_block.transpose() * b_block; } // BackSubstitute computes the same inverse, and this is the key workload // there, so caching these inverses makes BackSubstitute essentially free. typename EigenTypes::MatrixRef e_t_e_inverse( &e_t_e_inverse_matrices_[kEBlockSize * kEBlockSize * i], kEBlockSize, kEBlockSize); // e_t_e is a symmetric positive definite matrix, so the standard way to // compute its inverse is via the Cholesky factorization by calling // e_t_e.llt().solve(Identity()). However, the inverse() method even // though it is not optimized for symmetric matrices is significantly // faster for small fixed size (up to 4x4) matrices. // // https://eigen.tuxfamily.org/dox/group__TutorialLinearAlgebra.html#title3 e_t_e_inverse.noalias() = e_t_e.inverse(); // The use of these two pre-allocated tmp vectors saves temporaries in the // expressions for lhs and rhs updates below and has a significant impact // on the performance of this method. 
tmp.noalias() = e_t_e_inverse * e_t_f; tmp2.noalias() = e_t_e_inverse * e_t_b; lhs.noalias() -= e_t_f.transpose() * tmp; rhs.noalias() += f_t_b - e_t_f.transpose() * tmp2; } // The rows without any e-blocks can have arbitrary size but only contain // the f-block. // // lhs += f_i^T f_i // rhs += f_i^T b_i for (int row_id = uneliminated_row_begins_; row_id < bs->rows.size(); ++row_id) { const auto& row = bs->rows[row_id]; const auto& cell = row.cells[0]; const typename EigenTypes::ConstMatrixRef f_block(values + cell.position, row.block.size, kFBlockSize); const typename EigenTypes::ConstVectorRef b_block( b + row.block.position, row.block.size); lhs.noalias() += f_block.transpose() * f_block; rhs.noalias() += f_block.transpose() * b_block; } } // This implementation of BackSubstitute depends on Eliminate being called // before this. SchurComplementSolver always does this. // // y_i = e_t_e_inverse * sum_i e_i^T * (b_i - f_i * z); void BackSubstitute(const BlockSparseMatrixData& A, const double* b, const double* /*D*/, const double* z_ptr, double* y) override { typename EigenTypes::ConstVectorRef z(z_ptr, kFBlockSize); const CompressedRowBlockStructure* bs = A.block_structure(); const double* values = A.values(); Eigen::Matrix tmp; for (int i = 0; i < chunks_.size(); ++i) { const Chunk& chunk = chunks_[i]; const int e_block_id = bs->rows[chunk.start].cells.front().block_id; tmp.setZero(); for (int j = 0; j < chunk.num_rows; ++j) { const int row_id = chunk.start + j; const auto& row = bs->rows[row_id]; const typename EigenTypes::ConstMatrixRef e_block(values + row.cells[0].position, kRowBlockSize, kEBlockSize); const typename EigenTypes::ConstVectorRef b_block( b + row.block.position, kRowBlockSize); if (row.cells.size() == 1) { // There is no f block. 
tmp += e_block.transpose() * b_block; } else { typename EigenTypes::ConstMatrixRef f_block( values + row.cells[1].position, kRowBlockSize, kFBlockSize); tmp += e_block.transpose() * (b_block - f_block * z); } } typename EigenTypes::MatrixRef e_t_e_inverse( &e_t_e_inverse_matrices_[kEBlockSize * kEBlockSize * i], kEBlockSize, kEBlockSize); typename EigenTypes::VectorRef y_block( y + bs->cols[e_block_id].position, kEBlockSize); y_block.noalias() = e_t_e_inverse * tmp; } } private: struct Chunk { int start = 0; int num_rows = 0; }; std::vector chunks_; int num_eliminate_blocks_; int uneliminated_row_begins_; std::vector e_t_e_inverse_matrices_; }; } // namespace ceres::internal #include "ceres/internal/reenable_warnings.h" #endif // CERES_INTERNAL_SCHUR_ELIMINATOR_H_