Commit c5628500 authored by Richard Torenvliet's avatar Richard Torenvliet

Add OpenMP GPU offload test; parallelise computation of occluding boundary vertices

parent e861a303
@@ -113,6 +113,7 @@ set(HEADERS
${CMAKE_CURRENT_SOURCE_DIR}/include/eos/video/Keyframe.hpp
)
# Set OpenMP compiler flags if possible
find_package(OpenMP)
if (OPENMP_FOUND)
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
......
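With the OpenMP flags appended by this hunk, translation units can detect OpenMP support at compile time through the standard `_OPENMP` macro, the same guard the header below uses around its parallel implementation. A minimal standalone check (a sketch, not part of this commit):

```cpp
#include <cstdio>
#ifdef _OPENMP
#include <omp.h>
#endif

int main()
{
#ifdef _OPENMP
    // _OPENMP expands to the spec date (e.g. 201511 for OpenMP 4.5);
    // omp_get_max_threads() reports the default team size.
    std::printf("OpenMP %d, max threads %d\n", _OPENMP, omp_get_max_threads());
#else
    std::printf("compiled without OpenMP support\n");
#endif
    return 0;
}
```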
@@ -42,9 +42,10 @@ namespace eos {
struct FittingResult
{
RenderingParameters rendering_parameters;
std::vector<float> pca_shape_coefficients;
std::vector<float> blendshape_coefficients;
core::LandmarkCollection<cv::Vec2f> landmarks;
core::Mesh mesh;
};
} /* namespace fitting */
......
@@ -45,6 +45,7 @@
#include <utility>
#include <cstddef>
namespace eos {
namespace fitting {
@@ -391,7 +392,11 @@ inline std::pair<std::vector<cv::Vec2f>, std::vector<int>> find_occluding_edge_c
}
return { image_points, vertex_indices };
};
#ifdef _OPENMP
/**
* OpenMP version
*
* @brief Computes the vertices that lie on occluding boundaries, given a particular pose.
*
* This algorithm computes the edges that lie on occluding boundaries of the mesh.
@@ -404,7 +409,7 @@ inline std::pair<std::vector<cv::Vec2f>, std::vector<int>> find_occluding_edge_c
* @param[in] R The rotation (pose) under which the occluding boundaries should be computed.
* @return A vector with unique vertex id's making up the edges.
*/
std::vector<int> occluding_boundary_vertices_parallel(const core::Mesh& mesh, const morphablemodel::EdgeTopology& edge_topology, glm::mat4x4 R)
{
// Rotate the mesh:
std::vector<glm::vec4> rotated_vertices;
@@ -423,7 +428,8 @@ inline std::vector<int> occluding_boundary_vertices_parallel(const core::Mesh& m
// Find occluding edges:
std::vector<int> occluding_edges_indices;
for (int edge_idx = 0; edge_idx < edge_topology.adjacent_faces.size();
++edge_idx) // For each edge... Ef contains the indices of the two adjacent faces
{
const auto &edge = edge_topology.adjacent_faces[edge_idx];
if (edge[0] == 0) // ==> NOTE/Todo Need to change this if we use 0-based indexing!
@@ -455,65 +461,81 @@ inline std::vector<int> occluding_boundary_vertices_parallel(const core::Mesh& m
// Remove duplicate vertex id's (std::unique works only on sorted sequences):
std::sort(begin(occluding_vertices), end(occluding_vertices));
occluding_vertices.erase(std::unique(begin(occluding_vertices), end(occluding_vertices)), end(occluding_vertices));
// Perform ray-casting to find out which vertices are not visible (i.e. self-occluded):
// Store results in a flat array (one slot per occluding vertex) so iterations can
// write independently; all vertex ids are unique.
std::vector<int> vertex_id_visible_map(occluding_vertices.size());
auto t1 = std::chrono::high_resolution_clock::now();
// we shoot the ray from the vertex towards the camera
glm::vec3 ray_direction(0.0f, 0.0f, 1.0f);
// Inputs are mapped to the device and the result array back from it. Note that
// map() does not deep-copy std::vector contents; a real device offload would
// need the underlying buffers mapped as raw pointer/length pairs.
#pragma omp target map(to: occluding_vertices, rotated_vertices, mesh) map(from: vertex_id_visible_map)
{
#pragma omp parallel for
for (int i = 0; i < occluding_vertices.size(); i++) {
const auto vertex_idx = occluding_vertices[i];
int visible = 1;
glm::vec3 ray_origin(rotated_vertices[vertex_idx]);
// For every tri of the rotated mesh:
for (int j = 0; j < mesh.tvi.size(); j++) {
auto tri = mesh.tvi[j];
auto &v0 = rotated_vertices[tri[0]];
auto &v1 = rotated_vertices[tri[1]];
auto &v2 = rotated_vertices[tri[2]];
auto intersect = ray_triangle_intersect(ray_origin,
ray_direction,
glm::vec3(v0),
glm::vec3(v1),
glm::vec3(v2),
false);
// first is bool intersect, second is the distance t
if (intersect.first == true) {
// We've hit a triangle (the ray may hit the vertex's own triangles).
// If the intersection is at or behind the ray origin, ignore it:
if (intersect.second.get() <= 1e-4) {
continue; // the intersection is behind the vertex, we don't care about it
}
// Otherwise, we've hit a genuine triangle, and the vertex is not visible:
visible = 0;
break;
}
}
vertex_id_visible_map[i] = visible;
}
}
auto t2 = std::chrono::high_resolution_clock::now();
// Copy the visible vertices to the final list (drops the self-occluded ones):
std::vector<int> final_vertex_ids;
for (int i = 0; i < occluding_vertices.size(); ++i) {
const auto vertex_idx = occluding_vertices[i];
int visible = vertex_id_visible_map[i];
if (visible == 1) {
final_vertex_ids.push_back(vertex_idx);
}
}
auto final_timing = std::chrono::duration_cast<std::chrono::milliseconds>(t2-t1).count();
printf("Occluding boundary: %lu visible vertices, %lld ms (mean: %f ms/vertex)\n",
final_vertex_ids.size(),
final_timing,
static_cast<float>(final_timing) / final_vertex_ids.size()
);
return final_vertex_ids;
};
/**
* OpenMP version
*
* @brief For a given list of 2D edge points, find corresponding 3D vertex IDs.
*
*
* This algorithm first computes the 3D mesh's occluding boundary vertices under
@@ -593,6 +615,7 @@ inline std::pair<std::vector<cv::Vec2f>, std::vector<int>> find_occluding_edge_c
}
return { image_points, vertex_indices };
};
#endif // _OPENMP
......
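The visibility test in `occluding_boundary_vertices_parallel` leans on `ray_triangle_intersect`, which this diff does not show. For reference, a standalone sketch of the Möller-Trumbore intersection it presumably implements; the return type here is a plain `std::pair<bool, float>` rather than the `boost::optional` pair used by eos, and the backface-culling flag is omitted:

```cpp
#include <cmath>
#include <cstdio>
#include <utility>
#include <glm/glm.hpp>

// Möller-Trumbore ray/triangle intersection. Returns {hit, t}, where t is the
// distance along ray_direction; t is only meaningful when hit is true.
std::pair<bool, float> ray_triangle_intersect_sketch(const glm::vec3& ray_origin,
                                                     const glm::vec3& ray_direction,
                                                     const glm::vec3& v0,
                                                     const glm::vec3& v1,
                                                     const glm::vec3& v2)
{
    constexpr float epsilon = 1e-6f;
    const glm::vec3 e1 = v1 - v0;
    const glm::vec3 e2 = v2 - v0;
    const glm::vec3 pvec = glm::cross(ray_direction, e2);
    const float det = glm::dot(e1, pvec);
    if (std::abs(det) < epsilon) // ray parallel to the triangle's plane
        return { false, 0.0f };
    const float inv_det = 1.0f / det;
    const glm::vec3 tvec = ray_origin - v0;
    const float u = glm::dot(tvec, pvec) * inv_det; // first barycentric coord
    if (u < 0.0f || u > 1.0f)
        return { false, 0.0f };
    const glm::vec3 qvec = glm::cross(tvec, e1);
    const float v = glm::dot(ray_direction, qvec) * inv_det; // second coord
    if (v < 0.0f || u + v > 1.0f)
        return { false, 0.0f };
    return { true, glm::dot(e2, qvec) * inv_det }; // t along the ray
}

int main()
{
    // Ray from the origin along +z against a triangle in the z = 5 plane:
    auto hit = ray_triangle_intersect_sketch({0, 0, 0}, {0, 0, 1},
                                             {-1, -1, 5}, {1, -1, 5}, {0, 1, 5});
    std::printf("hit=%d t=%f\n", hit.first, hit.second); // prints hit=1 t=5.0
    return 0;
}
```

The `<= 1e-4` check in the caller then discards hits at (or numerically just behind) the ray origin, i.e. the vertex's own triangles.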
@@ -315,8 +315,8 @@ inline std::vector<float> fit_shape_to_landmarks_linear_multi_parallel(
#pragma omp parallel num_threads(num_threads)
{
#pragma omp for
for (int i = 0; i < num_landmarks; ++i) {
// Calculate the index from the landmark offset (the number of landmarks can differ per image).
int index = (landmark_offset_index[k] * 4) + 4 * i;
int offset = landmark_offset_index[k] + i;
auto vertex_id = vertex_ids[k][i];
@@ -328,16 +328,14 @@ inline std::vector<float> fit_shape_to_landmarks_linear_multi_parallel(
basis_rows_.block(0, 0, basis_rows_.rows(), num_coeffs_to_fit);
// Set y with landmark coordinates (will be used in linear lsq later)
#pragma omp simd
for (int j = 0; j < 2; j++) {
y(3 * offset + j) = landmarks[k][i][j];
}
// Set v_bar with face mesh (will be used in linear lsq later)
#pragma omp simd
for (int j = 0; j < 3; j++) {
v_bar(4 * offset + j) = base_face[k](vertex_id * 3 + j);
}
}
@@ -368,13 +366,13 @@ inline std::vector<float> fit_shape_to_landmarks_linear_multi_parallel(
auto t2 = std::chrono::high_resolution_clock::now();
t1 = std::chrono::high_resolution_clock::now();
// fill P with coefficients
P.setFromTriplets(P_coefficients.begin(), P_coefficients.end());
MatrixXf one;
MatrixXf two;
MatrixXf A;
MatrixXf b;
MatrixXf rhs;
#pragma omp parallel sections num_threads(2)
{
@@ -382,6 +380,7 @@ inline std::vector<float> fit_shape_to_landmarks_linear_multi_parallel(
{
A = P * V_hat_h; // camera matrix times the basis
one = A.transpose() * Omega.asDiagonal();
rhs = -one * b; // It's -A^t*Omega^t*b, but we don't need to transpose Omega, since it's a diagonal matrix, and Omega^t = Omega.
}
#pragma omp section
@@ -392,12 +391,8 @@ inline std::vector<float> fit_shape_to_landmarks_linear_multi_parallel(
}
const MatrixXf AtOmegaAReg = one * A + two;
const VectorXf c_s = AtOmegaAReg.colPivHouseholderQr().solve(rhs);
t2 = std::chrono::high_resolution_clock::now();
std::cout << "matrix xf " << std::chrono::duration_cast<std::chrono::milliseconds>(t2-t1).count() << "ms" << std::endl;
return std::vector<float>(c_s.data(), c_s.data() + c_s.size());
};
#endif
......
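The hunk above overlaps the two independent matrix products with `#pragma omp parallel sections`, each section running on its own thread with an implicit barrier before the combined result is used. A minimal standalone sketch of that pattern with Eigen (matrix names illustrative, not the ones from the fitting code):

```cpp
#include <iostream>
#include <Eigen/Dense>

int main()
{
    using Eigen::MatrixXf;
    const int n = 512;
    const MatrixXf A = MatrixXf::Random(n, n);
    const MatrixXf B = MatrixXf::Random(n, n);

    MatrixXf one, two;
    #pragma omp parallel sections num_threads(2)
    {
        #pragma omp section
        { one = A.transpose() * A; } // first independent product
        #pragma omp section
        { two = B.transpose() * B; } // second, computed concurrently
    }
    // Implicit barrier at the end of the sections region; safe to combine:
    const MatrixXf sum = one + two;
    std::cout << sum(0, 0) << "\n";
    return 0;
}
```

This only pays off when the two sections do comparable amounts of work; otherwise one thread idles at the barrier.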
@@ -141,6 +141,7 @@ inline std::pair<cv::Mat, cv::Mat> render(
// bool enable_texturing = false; Maybe re-add later, not sure
// take a cv::Mat texture instead and convert to Texture internally? no, we don't want to recreate mipmap levels on each render() call.
auto t1 = std::chrono::high_resolution_clock::now();
assert(mesh.vertices.size() == mesh.colors.size() || mesh.colors.empty()); // The number of vertices has to be equal for both shape and colour, or, alternatively, it has to be a shape-only model.
assert(mesh.vertices.size() == mesh.texcoords.size() || mesh.texcoords.empty()); // same for the texcoords
// another assert: If cv::Mat texture != empty, then we need texcoords?
@@ -177,7 +178,11 @@ inline std::pair<cv::Mat, cv::Mat> render(
// Prepare the rasterisation stage.
// For every vertex/tri:
vector<detail::TriangleToRasterize> triangles_to_raster;
// Note: a plain "#pragma omp parallel for" here would race on the push_back
// into triangles_to_raster; see the sketch after this file's hunks.
for (int i = 0; i < mesh.tvi.size(); i++) {
const auto& tri_indices = mesh.tvi[i];
// Todo: Split this whole stuff up. Make a "clip" function, ... rename "processProspective..".. what is "process"... get rid of "continue;"-stuff by moving stuff inside process...
// classify vertices visibility with respect to the planes of the view frustum
// we're in clip-coords (NDC), so just check if outside [-1, 1] x ...
@@ -186,8 +191,7 @@ inline std::pair<cv::Mat, cv::Mat> render(
// However, when comparing against w_c below, we might run into the trouble of the sign again in the affine case.
// 'w' is always positive, as it is -z_camspace, and all z_camspace are negative.
unsigned char visibility_bits[3];
for (unsigned char k = 0; k < 3; k++) {
visibility_bits[k] = 0;
float x_cc = clipspace_vertices[tri_indices[k]].position[0];
float y_cc = clipspace_vertices[tri_indices[k]].position[1];
@@ -214,8 +218,15 @@ inline std::pair<cv::Mat, cv::Mat> render(
// all vertices are visible - pass the whole triangle to the rasterizer. = All bits of all 3 triangles are 0.
if ((visibility_bits[0] | visibility_bits[1] | visibility_bits[2]) == 0)
{
boost::optional<detail::TriangleToRasterize> t =
detail::process_prospective_tri(clipspace_vertices[tri_indices[0]],
clipspace_vertices[tri_indices[1]],
clipspace_vertices[tri_indices[2]],
viewport_width,
viewport_height,
enable_backface_culling);
if (t)
{
triangles_to_raster.push_back(*t);
}
continue;
@@ -228,7 +239,11 @@ inline std::pair<cv::Mat, cv::Mat> render(
// split the triangle if it intersects the near plane:
if (enable_near_clipping)
{
vertices = detail::clip_polygon_to_plane_in_4d(vertices,
glm::tvec4<float>(0.0f,
0.0f,
-1.0f,
-1.0f)); // "Normal" (or "4D hyperplane") of the near-plane. I tested it and it works like this but I'm a little bit unsure because Songho says the normal of the near-plane is (0,0,-1,1) (maybe I have to switch around the < 0 checks in the function?)
}
// triangulation of the polygon formed of vertices array
@@ -236,18 +251,29 @@ inline std::pair<cv::Mat, cv::Mat> render(
{
for (unsigned char k = 0; k < vertices.size() - 2; k++)
{
boost::optional<detail::TriangleToRasterize> t = detail::process_prospective_tri(vertices[0],
vertices[1 + k],
vertices[2 + k],
viewport_width,
viewport_height,
enable_backface_culling);
if (t)
{
triangles_to_raster.push_back(*t);
}
}
}
}
// Fragment/pixel shader: Colour the pixel values
for (const auto& tri : triangles_to_raster) {
detail::raster_triangle(tri, colourbuffer, depthbuffer, texture, enable_far_clipping);
}
auto t2 = std::chrono::high_resolution_clock::now();
auto final_timing = std::chrono::duration_cast<std::chrono::milliseconds>(t2-t1).count();
printf("Tri %lu %lld ms\n", triangles_to_raster.size(), final_timing);
return std::make_pair(colourbuffer, depthbuffer);
};
......
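The commented-out `#pragma omp parallel for` over `mesh.tvi` in the hunk above would race on `triangles_to_raster.push_back`. One way to parallelise such a filter-and-collect loop safely (a sketch with illustrative stand-in types, not eos's actual structures) is per-thread buffers merged once per thread:

```cpp
#include <cstdio>
#include <vector>

struct Tri { int i0, i1, i2; }; // stand-in for detail::TriangleToRasterize

int main()
{
    const int num_tris = 100000;
    std::vector<Tri> triangles_to_raster;

    #pragma omp parallel
    {
        std::vector<Tri> local; // each thread collects into its own buffer
        #pragma omp for nowait
        for (int i = 0; i < num_tris; ++i) {
            if (i % 2 == 0) // stand-in for the visibility/clipping tests
                local.push_back({ i, i + 1, i + 2 });
        }
        #pragma omp critical // merge once per thread, not once per triangle
        triangles_to_raster.insert(triangles_to_raster.end(),
                                   local.begin(), local.end());
    }
    std::printf("collected %zu triangles\n", triangles_to_raster.size());
    return 0;
}
```

Note that the merged order differs from the serial loop's; that is only harmless if rasterisation does not depend on triangle submission order.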
@@ -39,8 +39,13 @@ add_executable(accuracy-evaluation accuracy-evaluation.cpp)
target_link_libraries(accuracy-evaluation eos ${OpenCV_LIBS} ${Boost_LIBRARIES})
target_link_libraries(accuracy-evaluation "$<$<CXX_COMPILER_ID:GNU>:-pthread>$<$<CXX_COMPILER_ID:Clang>:-pthreads>")
add_executable(openmp-test openmp-test.cpp)
target_link_libraries(openmp-test eos ${OpenCV_LIBS} ${Boost_LIBRARIES})
target_link_libraries(openmp-test "$<$<CXX_COMPILER_ID:GNU>:-pthread>$<$<CXX_COMPILER_ID:Clang>:-pthreads>")
# Install targets:
install(TARGETS scm-to-cereal DESTINATION bin)
install(TARGETS bfm-binary-to-cereal DESTINATION bin)
install(TARGETS edgestruct-csv-to-json DESTINATION bin)
install(TARGETS accuracy-evaluation DESTINATION bin)
install(TARGETS openmp-test DESTINATION bin)
@@ -48,6 +48,7 @@ using eos::core::Landmark;
using eos::core::LandmarkCollection;
using eos::video::BufferedVideoIterator;
using eos::video::WeightedIsomapAveraging;
using eos::video::ReconstructionVideoWriter;
using cv::Mat;
using cv::Vec2f;
using cv::Vec3f;
@@ -192,8 +193,13 @@ void render_output(
boost::property_tree::ptree settings,
int n_iter) {
std::string output_path = settings.get<std::string>("output.output_path", "/tmp");
float merge_isomap_face_angle = settings.get<float>("output.merge_isomap_face_angle", 60.f);
bool make_video = settings.get<bool>("output.save_video", false);
bool show_video = settings.get<bool>("output.show_video", false);
Mat merged_isomap;
WeightedIsomapAveraging isomap_averaging(merge_isomap_face_angle); // merge all triangles facing the camera within the configured angle
eos::video::PcaCoefficientMerging pca_shape_merging;
auto landmark_mapper = reconstruction_data.landmark_mapper;
@@ -208,10 +214,9 @@ void render_output(
auto unmodified_frame = frame.clone();
int frame_number = key_frames[i].frame_number;
float yaw_angle = key_frames[i].yaw_angle; // previously: glm::degrees(glm::yaw(rendering_paramss[i].get_rotation()))
cv::imwrite("/tmp/eos/" + std::to_string(frame_number) + "_" + std::to_string(yaw_angle) + ".png", frame);
// Extract the texture using the fitted mesh from this frame:
Mat affine_cam = fitting::get_3x4_affine_camera_matrix(rendering_paramss[i], frame.cols, frame.rows);
@@ -302,8 +307,20 @@ void render_output(
false,
rendering);
// if(save_video) {
// cv::imshow("render", rendering);
// cv::waitKey(1);
// }
//
// if(show_video) {
// cv::imshow("render", rendering);
// cv::waitKey(1);
// }
//
// eos::video::ReconstructionVideoWriter outputVideo;
// Size S = Size((int) inputVideo.get(CV_CAP_PROP_FRAME_WIDTH), //Acquire input size
// (int) inputVideo.get(CV_CAP_PROP_FRAME_HEIGHT));
// outputVideo.open(NAME , ex, inputVideo.get(CV_CAP_PROP_FPS),S, true);
}
/**
@@ -334,8 +351,10 @@ void evaluate_results(
boost::property_tree::ptree settings,
int n_iter) {
std::string output_path = settings.get<std::string>("output.output_path", "/tmp");
float merge_isomap_face_angle = settings.get<float>("output.merge_isomap_face_angle", 60.f);
WeightedIsomapAveraging isomap_averaging(merge_isomap_face_angle); // merge all triangles facing the camera within the configured angle
Mat merged_isomap;
fs::path outputfilebase = annotations[0];
@@ -377,11 +396,11 @@ void evaluate_results(
rendering_params.get_projection(),
fitting::get_opencv_viewport(frame_width, frame_height));
// bool eyes_open = isomap_averaging.has_eyes_open(frame, landmarks);
//
// if (!eyes_open) {
// continue;
// }
// cv::imshow("Img", outimg);
// cv::waitKey(0);
@@ -649,19 +668,21 @@ int main(int argc, char *argv[]) {
// Start getting video frames:
vid_iterator.start();
// vid_writer.start();
// Count the number of iterations:
int n_iter = 0;
while(vid_iterator.is_playing()) {
auto key_frames = vid_iterator.get_keyframes();
// Generate a sublist of the total available landmark list:
auto landmark_sublist = sample_landmarks(key_frames, landmark_list);
// std::cout << vid_iterator.to_string() << std::endl;
// it makes no sense to update pca_coeff if nothing in the buffer has changed:
if (vid_iterator.has_changed()) {
std::cout << "Going to reconstruct with " << key_frames.size() << " images."<< std::endl;
// Fit shape and pose:
auto t1 = std::chrono::high_resolution_clock::now();
std::tie(meshs, rendering_paramss) = fitting::fit_shape_and_pose_multi_parallel(
@@ -681,7 +702,8 @@ int main(int argc, char *argv[]) {
boost::none,
pca_shape_coefficients,
blendshape_coefficients,
fitted_image_points
fitted_image_points,
settings
);
auto t2 = std::chrono::high_resolution_clock::now();
@@ -690,62 +712,62 @@ int main(int argc, char *argv[]) {
<< "ms, mean(" << std::chrono::duration_cast<std::chrono::milliseconds>(t2-t1).count() / key_frames.size()
<< "ms)" << std::endl;
vid_iterator.update_reconstruction_coeff(pca_shape_coefficients);
// // Render output:
// render_output(
// evaluate_results(
// key_frames,
// rendering_paramss,
// landmark_sublist,
// meshs,
// pca_shape_coefficients,
// blendshape_coefficients,
// fitted_image_points,
// annotations,
// reconstruction_data,
// vid_iterator.last_frame,
// vid_iterator.last_frame_number,
// settings,
// n_iter
// );
} else {
// std::cout << "No reconstruction - buffer did not change" << std::endl;
}
// Get new frames:
n_iter++;
}
auto key_frames = vid_iterator.get_keyframes();
// vid_writer.__stop();
if(settings.get<bool>("output.make_video", false)) {
ReconstructionVideoWriter vid_writer;
try {
vid_writer = ReconstructionVideoWriter(videofile.string(), reconstruction_data, settings);
} catch(std::runtime_error &e) {
std::cout << e.what() << std::endl;
return EXIT_FAILURE;
}
// Render output:
std::cout << "Waiting for video to be completed..." << std::endl;
vid_writer.update_reconstruction_coeff(pca_shape_coefficients);
std::cout << "Going to reconstruct with " << key_frames.size() << " images."<< std::endl;
evaluate_results(
key_frames,
rendering_paramss,
meshs,
pca_shape_coefficients,
blendshape_coefficients,
fitted_image_points,
annotations,
reconstruction_data,
settings,
n_iter
);
while (vid_writer.next()) {
printf("%d/%d\r", vid_writer.get_frame_number(), vid_iterator.get_frame_number());
}
}
// auto key_frames = vid_iterator.get_keyframes();
// std::cout << "Going to reconstruct with " << key_frames.size() << " images."<< std::endl;
//
// evaluate_results(
// key_frames,
// rendering_paramss,
// meshs,
// pca_shape_coefficients,
// blendshape_coefficients,
// fitted_image_points,
// annotations,
// reconstruction_data,
// settings,
// n_iter
// );
//todo: we could build our final obj here?
......
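Several of the changes in this file replace hard-coded values (merge angle, video flags) with reads from the boost `ptree` that is now threaded through as `settings`. A minimal sketch of how such an ini-style configuration is loaded and queried with defaults (file name hypothetical):

```cpp
#include <iostream>
#include <boost/property_tree/ptree.hpp>
#include <boost/property_tree/ini_parser.hpp>

int main()
{
    boost::property_tree::ptree settings;
    try {
        // Hypothetical config file; the keys mirror the ones used above.
        boost::property_tree::read_ini("reconstruction.ini", settings);
    } catch (const boost::property_tree::ini_parser_error&) {
        // Missing file: fall through and rely on the defaults below.
    }

    // get<T>(path, default) returns the default if the key is absent:
    const float merge_angle = settings.get<float>("output.merge_isomap_face_angle", 60.f);
    const bool make_video = settings.get<bool>("output.make_video", false);

    std::cout << "merge angle: " << merge_angle
              << ", make video: " << std::boolalpha << make_video << "\n";
    return 0;
}
```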
#include "opencv2/core/core.hpp"
#include "opencv2/opencv.hpp"
#include "opencv2/highgui/highgui.hpp"
#include <chrono>
#include <complex>
#include <iostream>
#include <omp.h>
typedef std::complex<double> complex;
/**
* Calculates the distance between two images as the sum of absolute pixel
* differences; returns -1 if the dimensions are not the same.
*
* @param one First (grayscale) image.
* @param two Second (grayscale) image.
* @return The summed absolute difference, or -1 on a dimension mismatch.
*/
float calculate_distance_images(cv::Mat one, cv::Mat two) {
int width = one.cols;
int height = one.rows;
if(width != two.cols || height != two.rows) {
return -1;
}
float distance = 0.0;
for (int y = 0; y < height; y++) {
for (int x = 0; x < width; x++) {
distance += fabs(one.at<uchar>(y, x) - two.at<uchar>(y, x));
}
}
return distance;
}
void cpu(cv::Mat A, cv::Mat Anew) {
int n = A.rows;
int m = A.cols;
float error = 1000;
float tol = 30;
int iter = 0;
int iter_max = 100;
while (error > tol && iter < iter_max)
{
error = 0.0;
#pragma omp parallel num_threads(4)
{
#pragma omp for reduction(max:error)
for (int j = 1; j < n - 1; j++)
{
for (int i = 1; i < m - 1; i++)
{
Anew.at<uchar>(j, i) = 0.25 * (A.at<uchar>(j, i + 1) + A.at<uchar>(j, i - 1)
+ A.at<uchar>(j - 1, i) + A.at<uchar>(j + 1, i));
error = fmax(error, fabs(Anew.at<uchar>(j, i) - A.at<uchar>(j, i)));
}
}
}
// Iterate on the new estimate: copy it back before the next sweep.
Anew.copyTo(A);
iter++;
}
}
void gpu(cv::Mat A, cv::Mat Anew) {
int n = A.rows;
int m = A.cols;
float error = 1000;
float tol = 30;
int iter = 0;
int iter_max = 100;
while ( error > tol && iter < iter_max )
{
error = 0.0;
// error must be mapped tofrom: scalars are firstprivate inside a target
// region by default (OpenMP 4.5), so the host copy would otherwise never be
// updated and the while loop could not terminate on it.
// Note: cv::Mat's pixel buffer is not deep-copied by map(); a real device
// offload would map A.data/Anew.data as raw arrays.
#pragma omp target map(tofrom: error)
{
#pragma omp parallel for reduction(max:error)
for (int j = 1; j < n - 1; j++)
{
for (int i = 1; i < m - 1; i++)
{
Anew.at<uchar>(j, i) = 0.25 * (A.at<uchar>(j, i + 1) + A.at<uchar>(j, i - 1)
+ A.at<uchar>(j - 1, i) + A.at<uchar>(j + 1, i));
error = fmax(error, fabs(Anew.at<uchar>(j, i) - A.at<uchar>(j, i)));
}
}
}
// Advance on the host (iter would be firstprivate on the device and never
// change there) and iterate on the new estimate:
Anew.copyTo(A);
iter++;
}
}
int main(int argc, char** argv)
{
cv::Mat original = cv::imread("/data/img.jpg");
if (original.empty()) {
std::cerr << "Could not load /data/img.jpg" << std::endl;
return 1;
}
cv::cvtColor(original, original, CV_BGR2GRAY);
cv::Mat gpuImage = original.clone();
cv::Mat gpuImageNew = original.clone();
auto t1 = std::chrono::high_resolution_clock::now();
gpu(gpuImage, gpuImageNew);
auto t2 = std::chrono::high_resolution_clock::now();
std::cout << "gpu time: "
<< std::chrono::duration_cast<std::chrono::milliseconds>(t2-t1).count()
<< "ms" << std::endl;
cv::Mat cpuImage = original.clone();
cv::Mat cpuImageNew = original.clone();
t1 = std::chrono::high_resolution_clock::now();
cpu(cpuImage, cpuImageNew);
t2 = std::chrono::high_resolution_clock::now();
float distance = calculate_distance_images(gpuImageNew, cpuImageNew);
std::cout << "cpu time: "
<< std::chrono::duration_cast<std::chrono::milliseconds>(t2-t1).count()
<< "ms" << std::endl;
std::cout << "Distance gpu - cpu: " << distance << std::endl;
cv::imshow("gpu", gpuImageNew);
cv::imshow("cpu", cpuImageNew);
cv::waitKey(0);
}
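Two OpenMP pitfalls in `gpu()` are worth calling out (and are patched in the listing above): scalars such as `error` and `iter` are firstprivate inside a `#pragma omp target` region by default, so the host never sees their updates unless they are mapped or advanced outside the region, and `cv::Mat` objects are not deep-copied by map clauses. A standalone sketch of one Jacobi sweep offloaded with explicit raw-buffer maps (it falls back to host execution when no device is available):

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

int main()
{
    const int n = 256, m = 256;
    std::vector<float> A(n * m, 0.0f), Anew(n * m, 0.0f);
    for (int i = 0; i < m; ++i) A[i] = 100.0f; // hot top edge

    float* a = A.data();
    float* anew = Anew.data();
    float error = 0.0f;

    // Map the raw buffers explicitly; error is tofrom so the reduction
    // result is copied back to the host after the region ends.
    #pragma omp target teams distribute parallel for collapse(2) \
        reduction(max: error) \
        map(to: a[0:n*m]) map(tofrom: anew[0:n*m]) map(tofrom: error)
    for (int j = 1; j < n - 1; ++j) {
        for (int i = 1; i < m - 1; ++i) {
            anew[j * m + i] = 0.25f * (a[j * m + i + 1] + a[j * m + i - 1]
                                     + a[(j - 1) * m + i] + a[(j + 1) * m + i]);
            error = fmaxf(error, fabsf(anew[j * m + i] - a[j * m + i]));
        }
    }
    std::printf("error after one sweep: %f\n", error); // 25.0 for this setup
    return 0;
}
```

Without `-fopenmp` (plus an offload flag such as GCC's `-foffload=...`) the pragma is ignored and the loop simply runs serially on the host, which is also why the original test "works" regardless of whether offload actually happens.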