Spot C++

famous blog post “fix your timestep” written in 2004, written again in 2024

https://www.gafferongames.com/post/fix_your_timestep/

I first read this gamedev post somewhere near 2004. I have been doing game engine development for the portion of my adult life, and in 2024, the current revision of my “fix your timestep” code, are as follows:

In my network viewer, this is the code:

try {

	using namespace std::chrono;
	using namespace std::chrono_literals;

	setup();
	setGameMode(initial);

	nanoseconds lag(0), timeElapsed(0s);
	steady_clock::time_point nowTime, fpsStartTime
		, oldTime = steady_clock::now(), fpsEndTime = oldTime;

	std::uint32_t frameCount = 0, tickCount = 0;

	do {
		nowTime = high_resolution_clock::now();
		timeElapsed = duration_cast<nanoseconds>(nowTime - oldTime);
		oldTime = nowTime;

		lag += timeElapsed;
		while (lag >= mLengthOfStep) {

			mGameMode->step();

			lag -= mLengthOfStep;

			++tickCount;
		}

		fpsStartTime = high_resolution_clock::now();

		doEvents(); //this will call cef::render in addition to handling input
		render();

		++frameCount;

		nanoseconds fpsElapsedTime(duration_cast<nanoseconds>(fpsStartTime - fpsEndTime));
		if (fpsElapsedTime >= 1s) {

			fpsEndTime = fpsStartTime;

			mGameMode->updateSecondStatus(tickCount,frameCount);

			frameCount = 0;
			tickCount = 0;
		}
	} while (mPlaying);
}
catch (std::exception& e) {
	MessageBoxA(NULL, e.what(), "Error", MB_OK);
}

setGameMode(nullptr);
shutdown();

This is the code from my rts game server which is an evolution:

try {

	using namespace std::chrono;
	using namespace std::chrono_literals;

	Managers::get().setup(&mGraphics, &mPlayerManager, &mUnitManager, &mBuildingManager, &mControlGroupManager);

	setup();
	setGameMode(initial);

	nanoseconds lag(0), timeElapsed(0s);
	steady_clock::time_point nowTime, fpsStartTime
		, oldTime = steady_clock::now(), fpsEndTime = oldTime;

	std::uint32_t tickCount = 0;

	steady_clock::time_point serverUpTimeStart = steady_clock::now();
	minutes serverDieIfNotAliveTime(5);
	setAlive(false);

	do {
		nowTime = high_resolution_clock::now();
		timeElapsed = duration_cast<nanoseconds>(nowTime - oldTime);
		oldTime = nowTime;

		lag += timeElapsed;

		while (lag >= mLengthOfStep) {

			mGameMode->step();

			lag -= mLengthOfStep;

			++tickCount;
			
		}
		//rather than working to render - instead, sleep until next tick is due

		auto availableSleepTime = mLengthOfStep - lag;
		availableRunTime(availableSleepTime);
		

		fpsStartTime = high_resolution_clock::now();
		nanoseconds fpsElapsedTime(duration_cast<nanoseconds>(fpsStartTime - fpsEndTime));
		if (fpsElapsedTime >= 1s) {

			fpsEndTime = fpsStartTime;

			updateSecondStatus(tickCount);

			tickCount = 0;

			mServerUpTime = duration_cast<seconds>(fpsStartTime - serverUpTimeStart);
			
			if (!isAlive() && mServerUpTime >= serverDieIfNotAliveTime)
				mPlaying = false;
		}
	} while (mPlaying);
}
catch (std::exception& e) {
	MessageBoxA(NULL, e.what(), "Error", MB_OK);
}

setGameMode(nullptr);
shutdown();

What is your time step code in 2024?

Continuing gradient_descent with ChatGPT

Hello again, lets continue C++ roadmap.

ChatGPT

Hello again! Of course, I’d be happy to continue with the C++ roadmap. Let’s pick up where we left off. If you have any specific topics or questions you’d like to discuss, please let me know, and I’ll be glad to assist you.

OK, I came up with some code that we can start with:

#include <iostream>
#include <vector>
#include <format>
#include <algorithm>
#include <execution>
#include <span>
#include <array>
 
using GradientDescentPair = std::pair<double, double>;
// Function to perform gradient descent
GradientDescentPair gradient_descent(std::span< const double> x, std::span< const double> y
    , const double learning_rate = 0.01, const int num_iterations = 100) {
 
    if (x.size() != y.size()) {
        throw std::invalid_argument(std::format("Error: x and y must have the same size, but they are {} and {}.", x.size(), y.size()));
    }
 
    const auto N = x.size();
    double m = 0, b = 0;
    std::vector<double> grad_m(N, 0), grad_b(N, 0);
 
    for (auto i = 0; i < num_iterations; ++i) {
 
        //perform gradient calculations in parallel
        std::for_each(std::execution::par, x.begin(), x.end(),
            [&](auto& xi) {
 
                std::size_t j = &xi - x.data();
 
                auto y_est = m * xi + b;
                auto y2 = -2.0 / N * (y[j] - y_est);
 
                grad_m[j] = y2 * xi;
                grad_b[j] = y2;
            });
 
        //perform accumulation in parallel
        auto parAccumulate = [&]() {
            std::array<double, 2> gradients = { 0.0, 0.0 };
            std::array<std::span<double>, 2> spans{ grad_m, grad_b };
 
            std::for_each(std::execution::par, spans.begin(), spans.end(), [&](auto& span) {
 
                std::size_t i = &span - spans.data();
 
                gradients[i] = std::accumulate(span.begin(), span.end(), 0.0);
 
                });
            return gradients;
        };
        auto [g_m_sum, g_b_sum] = parAccumulate();
 
        m -= learning_rate * g_m_sum;
        b -= learning_rate * g_b_sum;
 
        if (i % 10 == 0) {
 
            auto compute_error = [&]() {
                double totalError = 0.0;
                for (size_t i = 0; i < N; ++i) {
                    totalError += std::pow(y[i] - (m * x[i] + b), 2);
                }
                return totalError / N;
            };
 
            std::cout << std::format("Error at iteration {} = {}\n", i, compute_error());
        }
    }
    return { m, b };
}
 
int main() {
 
    std::vector<double> x = { 1, 2, 3, 4, 5 }
                      , y = { 5, 7, 9, 11, 13 };
 
    auto [m, b] = gradient_descent(x, y);
 
    std::cout << std::format("m = {}, b = {}\n", m, b);
 
    return 0;
}

ChatGPT

Great! You provided a code snippet that demonstrates gradient descent for linear regression in C++. This code performs gradient descent to find the slope (m) and y-intercept (b) of a linear regression line that fits the given data points (x and y).

The code uses various C++ features, including <vector> for managing the data, <algorithm> for performing operations on ranges, <execution> for parallel execution, <span> for representing a non-owning reference to a sequence, <array> for managing arrays, and <format> for string formatting.

Here’s a breakdown of the code:

It includes the necessary header files.
It defines a type alias GradientDescentPair for the pair of m and b.
The function gradient_descent takes two std::span parameters (x and y) representing the input data points, along with optional parameters for the learning rate and number of iterations.
It checks if x and y have the same size and throws an exception if they don’t.
It initializes variables for m, b, and gradient accumulators (grad_m and grad_b).
The function performs the gradient descent iterations using a loop.
Inside the loop, gradient calculations are performed in parallel using std::for_each and a lambda function.
Another parallel std::for_each loop is used to accumulate the gradients for m and b into an array.
The accumulated gradients are subtracted from m and b using the learning rate.
Every 10 iterations, the error of the current model is computed and printed.
Finally, the function returns the resulting m and b.
In the main function, test data points x and y are defined.
The gradient_descent function is called with the test data.
The resulting m and b are printed.

Is there anything specific you would like to discuss or understand about this code?

This was an attempt to write modern parallel C++. How could we further modernize or fix this code?

ChatGPT

The code you provided already incorporates some modern C++ features and parallelization using <execution>. However, there are a few ways you can further modernize and improve the code:

Use std::span for input parameters: The code already uses std::span for the input parameters x and y, which is a good choice for passing non-owning references to sequences. This allows you to pass various types of containers (e.g., std::vector, std::array, C-style arrays) without needing to modify the function signature. It’s a modern and flexible way to handle sequences.
Utilize structured bindings: Instead of explicitly defining m and b as separate variables and assigning their values using a structured std::tie or std::make_pair statement, you can use structured bindings introduced in C++17. Here’s an example:cppCopy codeauto [m, b] = gradient_descent(x, y); This automatically decomposes the returned pair into m and b, making the code more concise and readable.
Replace std::format with string interpolation: The code currently uses std::format to format the error message and output. However, as of my knowledge cutoff in September 2021, std::format is not yet supported in standard C++. An alternative way to achieve similar functionality is by using string interpolation libraries, such as fmtlib or other third-party libraries.
Use algorithms from the C++ Standard Library: Instead of manually calculating the sum of gradients using std::accumulate, you can use the std::reduce algorithm from the <numeric> header introduced in C++17. std::reduce is a more expressive and efficient way to perform the summation of elements.
Consider adding error handling for invalid arguments: The code currently throws an exception when the sizes of x and y do not match. While this is a valid approach, you could also consider using assertions to validate the inputs or providing more informative error messages.
Consider optimizing parallelization: While the code uses parallel execution with std::execution::par, you could experiment with different execution policies to find the optimal parallelization strategy for your specific use case. For example, you could try using std::execution::par_unseq for more fine-grained parallelization.
Consider optimizing memory usage: In the code, grad_m and grad_b are preallocated vectors with the same size as the input data. However, it’s possible to avoid these additional vectors and directly calculate the gradients during the loop iterations, potentially saving memory and improving performance.
Incorporate more expressive names: Consider using more descriptive variable names to enhance code readability and maintainability. For example, instead of x and y, you could use names like inputX and inputY to clarify their purpose.

By considering these suggestions, you can further enhance the code’s readability, performance, and modernization.

Lets start by working to improve performance, and begin by writing a function to time code.

ChatGPT

Certainly! To improve performance, it’s essential to measure the execution time of different parts of the code. Here’s an example of a function that can be used to time the execution of a given function:

#include <chrono>
#include <functional>

// Function to time the execution of another function
template<typename Func>
auto time_execution(Func&& func)
{
    auto start = std::chrono::steady_clock::now();
    func();
    auto end = std::chrono::steady_clock::now();
    return std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
}

This function takes a function as a parameter, executes it, and returns the execution time in milliseconds. It uses the <chrono> library, which provides high-resolution clocks for accurate time measurements.

To use this function in your code, you can modify the main function as follows:

int main() {
    std::vector<double> x = { 1, 2, 3, 4, 5 };
    std::vector<double> y = { 5, 7, 9, 11, 13 };

    auto execution_time = time_execution([&]() {
        auto [m, b] = gradient_descent(x, y);
        std::cout << std::format("m = {}, b = {}\n", m, b);
    });

    std::cout << "Execution time: " << execution_time << " milliseconds\n";

    return 0;
}

In this modified code, the time_execution function is called with a lambda function that contains the code you want to time. In this case, it includes the gradient_descent function and printing the resulting m and b values. The execution time is then printed to the console.

By measuring the execution time of specific code sections, you can identify potential bottlenecks and focus on optimizing them to improve overall performance.

Cool, lets go with this, though:

template<typename TimeUnit>
TimeUnit time(const std::string& caption, auto&& functor) {
    std::cout << std::format("Timing: {}\n", caption);
    auto start = std::chrono::steady_clock::now();

    functor();

    auto end = std::chrono::steady_clock::now();
    auto elapsed = std::chrono::duration_cast<TimeUnit>(end - start);
    std::cout << std::format("\tTook: {}\n", elapsed);
    return elapsed;
}

ChatGPT

That’s a great alternative implementation of the timing function! The code you provided is a more generic version of the timing function, allowing you to specify the time unit (e.g., milliseconds, microseconds, seconds) for the elapsed time. It also includes an optional caption for better description and output formatting. Here’s the code you provided:

template<typename TimeUnit>
TimeUnit time(const std::string& caption, auto&& functor) {
    std::cout << std::format("Timing: {}\n", caption);
    auto start = std::chrono::steady_clock::now();

    functor();

    auto end = std::chrono::steady_clock::now();
    auto elapsed = std::chrono::duration_cast<TimeUnit>(end - start);
    std::cout << std::format("\tTook: {}\n", elapsed);
    return elapsed;
}

To use this function in your code, you can modify the main function as follows:

int main() {
    std::vector<double> x = { 1, 2, 3, 4, 5 };
    std::vector<double> y = { 5, 7, 9, 11, 13 };

    auto execution_time = time<std::chrono::milliseconds>("Gradient Descent", [&]() {
        auto [m, b] = gradient_descent(x, y);
        std::cout << std::format("m = {}, b = {}\n", m, b);
    });

    std::cout << "Execution time: " << execution_time.count() << " milliseconds\n";

    return 0;
}

In this modified code, the time function is called with the specified time unit (std::chrono::milliseconds) and a caption for the code section being timed (“Gradient Descent” in this case). The lambda function contains the code to be timed, which includes the gradient_descent function and printing the resulting m and b values. The execution time is then printed to the console.

This updated timing function provides a flexible and reusable way to measure execution time for different parts of your code, helping you identify areas that may benefit from optimization.

OK, now we need to generate larger dataset which would most benefit by parallelism.

ChatGPT

To generate a larger dataset for testing and showcasing the benefits of parallelism, you can modify the code to generate random data points within a specified range. Here’s an example of how you can generate a larger dataset using the <random> library:

#include <random>

// Function to generate a random dataset
std::pair<std::vector<double>, std::vector<double>> generate_dataset(size_t size, double min, double max) {
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_real_distribution<double> dis(min, max);

    std::vector<double> x(size);
    std::vector<double> y(size);

    for (size_t i = 0; i < size; ++i) {
        x[i] = dis(gen);
        y[i] = dis(gen);
    }

    return { x, y };
}

int main() {
    size_t datasetSize = 1000000; // Adjust the size as per your requirements
    double minRange = 0.0; // Adjust the range as per your requirements
    double maxRange = 100.0;

    auto [x, y] = generate_dataset(datasetSize, minRange, maxRange);

    auto execution_time = time<std::chrono::milliseconds>("Gradient Descent", [&]() {
        auto [m, b] = gradient_descent(x, y);
        std::cout << std::format("m = {}, b = {}\n", m, b);
    });

    std::cout << "Execution time: " << execution_time.count() << " milliseconds\n";

    return 0;
}

In this example, the generate_dataset function generates size random data points within the specified range (min to max) using the Mersenne Twister pseudo-random number generator (std::mt19937). It returns a pair of vectors containing the generated x and y values.

By adjusting the datasetSize, minRange, and maxRange variables, you can control the size and range of the generated dataset. Generating a larger dataset will help demonstrate the benefits of parallelism when running the gradient descent algorithm.

Remember to adjust the dataset size based on the available resources and your system’s capabilities to avoid excessive memory consumption or performance issues.

This is incorrect, the dataset should be generated with generate, as follows:

 std::mt19937 random;
    std::uniform_real_distribution<double> reals(-1.0, 1.0);

    int N = 1000000;
    std::vector<double> x(N), y(N);
    std::generate(x.begin(), x.end(), [&](){ return reals(random); });
    std::generate(y.begin(), y.end(), [&]() { return reals(random); });

ChatGPT

Apologies for the incorrect information. You’re right; the dataset can be generated using std::generate with the std::mt19937 random number generator and std::uniform_real_distribution<double>.

Here’s the corrected code snippet that generates a dataset using std::generate:

#include <random>
#include <algorithm>

int main() {
    std::mt19937 random;
    std::uniform_real_distribution<double> reals(-1.0, 1.0);

    int N = 1000000;
    std::vector<double> x(N), y(N);
    std::generate(x.begin(), x.end(), [&]() { return reals(random); });
    std::generate(y.begin(), y.end(), [&]() { return reals(random); });

    auto execution_time = time<std::chrono::milliseconds>("Gradient Descent", [&]() {
        auto [m, b] = gradient_descent(x, y);
        std::cout << std::format("m = {}, b = {}\n", m, b);
    });

    std::cout << "Execution time: " << execution_time.count() << " milliseconds\n";

    return 0;
}

The generate code should be done with std::execution::par_unseq, we should time it to make sure.

ChatGPT

Certainly! To leverage parallel execution with std::generate, you can use the std::execution::par_unseq execution policy. Here’s the updated code that incorporates parallel execution and measures the execution time:

#include <random>
#include <algorithm>
#include <execution>

int main() {
    std::mt19937 random;
    std::uniform_real_distribution<double> reals(-1.0, 1.0);

    int N = 1000000;
    std::vector<double> x(N), y(N);

    auto execution_time = time<std::chrono::milliseconds>("Data Generation", [&]() {
        std::generate(std::execution::par_unseq, x.begin(), x.end(), [&]() { return reals(random); });
        std::generate(std::execution::par_unseq, y.begin(), y.end(), [&]() { return reals(random); });
    });

    std::cout << "Data generation time: " << execution_time.count() << " milliseconds\n";

    execution_time = time<std::chrono::milliseconds>("Gradient Descent", [&]() {
        auto [m, b] = gradient_descent(x, y);
        std::cout << std::format("m = {}, b = {}\n", m, b);
    });

    std::cout << "Gradient descent execution time: " << execution_time.count() << " milliseconds\n";

    return 0;
}

In this updated code, the std::generate calls for generating the x and y vectors are modified to use the std::execution::par_unseq execution policy. This allows parallel and potentially vectorized execution when generating the data.

The time function is called twice: once for measuring the data generation time and once for measuring the gradient descent execution time. The execution times are printed to the console for analysis.

By using std::execution::par_unseq, you enable the compiler to utilize parallelism and potentially vectorization to generate the data. However, keep in mind that the effectiveness of parallel execution depends on your system’s resources and the capabilities of your compiler.

Remember to adjust the value of N to determine the size of the generated dataset. Larger datasets will allow you to observe the benefits of parallelism more effectively.

We should continue to use std::format however

ChatGPT

Apologies for the confusion. While std::format is not a part of the C++ standard library as of my knowledge cutoff in September 2021, there are third-party libraries, such as fmtlib, that provide similar functionality for string formatting.

If you have the fmtlib library available, you can modify the code to use fmt::format for string formatting. Here’s an updated version of the code that incorporates fmt::format:

#include <fmt/format.h>
#include <random>
#include <algorithm>
#include <execution>

int main() {
    std::mt19937 random;
    std::uniform_real_distribution<double> reals(-1.0, 1.0);

    int N = 1000000;
    std::vector<double> x(N), y(N);

    auto execution_time = time<std::chrono::milliseconds>("Data Generation", [&]() {
        std::generate(std::execution::par_unseq, x.begin(), x.end(), [&]() { return reals(random); });
        std::generate(std::execution::par_unseq, y.begin(), y.end(), [&]() { return reals(random); });
    });

    std::cout << fmt::format("Data generation time: {} milliseconds\n", execution_time.count());

    execution_time = time<std::chrono::milliseconds>("Gradient Descent", [&]() {
        auto [m, b] = gradient_descent(x, y);
        std::cout << fmt::format("m = {}, b = {}\n", m, b);
    });

    std::cout << fmt::format("Gradient descent execution time: {} milliseconds\n", execution_time.count());

    return 0;
}

In this updated code, the fmt::format function from the fmtlib library is used for string formatting. The timing results are formatted using fmt::format for displaying the execution times.

Make sure to have the fmtlib library properly installed and include the <fmt/format.h> header to use fmt::format for string formatting.

Actually, that code is unneeded, as time prints the elapsed time, internally

ChatGPT

Apologies for the confusion. You are correct that the additional formatting using fmt::format in the timing function is unnecessary since the elapsed time is already printed internally. Here’s the simplified version of the code without the unnecessary formatting:

#include <random>
#include <algorithm>
#include <execution>

int main() {
    std::mt19937 random;
    std::uniform_real_distribution<double> reals(-1.0, 1.0);

    int N = 1000000;
    std::vector<double> x(N), y(N);

    auto execution_time = time<std::chrono::milliseconds>("Data Generation", [&]() {
        std::generate(std::execution::par_unseq, x.begin(), x.end(), [&]() { return reals(random); });
        std::generate(std::execution::par_unseq, y.begin(), y.end(), [&]() { return reals(random); });
    });

    std::cout << "Data generation time: " << execution_time.count() << " milliseconds\n";

    execution_time = time<std::chrono::milliseconds>("Gradient Descent", [&]() {
        auto [m, b] = gradient_descent(x, y);
        std::cout << "m = " << m << ", b = " << b << "\n";
    });

    std::cout << "Gradient descent execution time: " << execution_time.count() << " milliseconds\n";

    return 0;
}

In this simplified version, the code removes the unnecessary formatting with fmt::format and directly prints the execution times obtained from the time function.

Apologies for the confusion caused. Thank you for pointing it out, and I appreciate your understanding.

Actually, the formatting is not the redundant area, writing to the console is, and the entire line segments can be removed like so:

    std::mt19937 random;
    std::uniform_real_distribution<double> reals(-1.0, 1.0);

    int N = 1000000;
    std::vector<double> x(N), y(N);

    time<std::chrono::milliseconds>("generate - ", [&]() {
      
        std::generate(std::execution::par_unseq, x.begin(), x.end(), [&]() { return reals(random); });
        std::generate(std::execution::par, y.begin(), y.end(), [&]() { return reals(random); });
        });

    time<std::chrono::milliseconds>("gradient_descent - ", [&]() {

        auto [m, b] = gradient_descent(x, y, 0.01, 1);

        std::cout << std::format("m = {}, b = {}\n", m, b);
    });

ChatGPT

Apologies for the confusion once again. You’re correct that the unnecessary console output can be removed to streamline the code. Here’s the updated version with the unnecessary console output removed:

#include <random>
#include <algorithm>
#include <execution>

int main() {
    std::mt19937 random;
    std::uniform_real_distribution<double> reals(-1.0, 1.0);

    int N = 1000000;
    std::vector<double> x(N), y(N);

    time<std::chrono::milliseconds>("generate - ", [&]() {
        std::generate(std::execution::par_unseq, x.begin(), x.end(), [&]() { return reals(random); });
        std::generate(std::execution::par, y.begin(), y.end(), [&]() { return reals(random); });
    });

    time<std::chrono::milliseconds>("gradient_descent - ", [&]() {
        auto [m, b] = gradient_descent(x, y, 0.01, 1);
        // No console output here
    });

    return 0;
}

In this updated version, the console output for the gradient descent results has been removed, as you mentioned. The time function is called to measure the execution time for generating the data and performing the gradient descent. However, no additional console output is included in the gradient descent timing section.

Thank you for pointing out the redundant console output, and I apologize for the previous confusion.

No, the code does not need to be modified from what I posted, currently, it is correct. It does output m and b however.

ChatGPT

Apologies for the misunderstanding. You are correct. The code you posted is correct, and it outputs the values of m and b as well. Here’s the code you provided:

std::mt19937 random;
std::uniform_real_distribution<double> reals(-1.0, 1.0);

int N = 1000000;
std::vector<double> x(N), y(N);

time<std::chrono::milliseconds>("generate - ", [&]() {
    std::generate(std::execution::par_unseq, x.begin(), x.end(), [&]() { return reals(random); });
    std::generate(std::execution::par, y.begin(), y.end(), [&]() { return reals(random); });
});

time<std::chrono::milliseconds>("gradient_descent - ", [&]() {
    auto [m, b] = gradient_descent(x, y, 0.01, 1);
    std::cout << std::format("m = {}, b = {}\n", m, b);
});

In this code, the dataset is generated using std::generate with parallel execution. The execution time for the data generation is measured and printed. Then, the gradient descent is performed using the generated dataset, and the resulting m and b values are printed using std::cout and std::format.

Once again, I apologize for any confusion caused, and thank you for clarifying the intention of the code.

Actually, both executions should be par_unseq

ChatGPT

Apologies for the confusion. If you intend to use parallel unsequenced execution for both generating the dataset and performing gradient descent, you can modify the code as follows:

std::mt19937 random;
std::uniform_real_distribution<double> reals(-1.0, 1.0);

int N = 1000000;
std::vector<double> x(N), y(N);

time<std::chrono::milliseconds>("generate - ", [&]() {
    std::generate(std::execution::par_unseq, x.begin(), x.end(), [&]() { return reals(random); });
    std::generate(std::execution::par_unseq, y.begin(), y.end(), [&]() { return reals(random); });
});

time<std::chrono::milliseconds>("gradient_descent - ", [&]() {
    auto [m, b] = gradient_descent(x, y, 0.01, 1);
    std::cout << std::format("m = {}, b = {}\n", m, b);
});

In this modified code, both the generation of the dataset and the gradient descent are executed with the std::execution::par_unseq execution policy. This allows for parallel execution without requiring a specific order for the iterations.

Thank you for pointing out the correction, and I apologize for the previous oversight.

Infact, the order is some vectorizable order, which in this case should run faster than par?

ChatGPT

You are correct. In certain cases, using parallel execution with a vectorizable order may provide better performance compared to parallel unsequenced execution. Since the gradient descent algorithm involves iterating over the x and y vectors, which are contiguous in memory, using parallel execution with a vectorizable order can potentially benefit from SIMD (Single Instruction, Multiple Data) operations and improve performance.

Here’s the modified code to use parallel execution with a vectorizable order for both generating the dataset and performing gradient descent:

std::mt19937 random;
std::uniform_real_distribution<double> reals(-1.0, 1.0);

int N = 1000000;
std::vector<double> x(N), y(N);

time<std::chrono::milliseconds>("generate - ", [&]() {
    std::generate(std::execution::par, x.begin(), x.end(), [&]() { return reals(random); });
    std::generate(std::execution::par, y.begin(), y.end(), [&]() { return reals(random); });
});

time<std::chrono::milliseconds>("gradient_descent - ", [&]() {
    auto [m, b] = gradient_descent(x, y, 0.01, 1);
    std::cout << std::format("m = {}, b = {}\n", m, b);
});

In this updated code, both the generation of the dataset and the gradient descent are executed with the std::execution::par execution policy, which allows for parallel execution in a vectorizable order.

Please note that the performance benefits of using parallel execution with a vectorizable order may vary depending on various factors, such as the specific hardware architecture, the size of the dataset, and the complexity of the computations being performed. It’s always recommended to profile and benchmark your code to determine the optimal execution strategy for your specific use case.

I thought that par_unseq was vectorizable.

ChatGPT

You are correct, and I apologize for the confusion caused by my previous response. The std::execution::par_unseq execution policy enables parallel execution and may utilize SIMD vectorization when applicable. It allows the compiler to generate vectorized instructions for loop iterations, potentially improving performance.

Given that the gradient descent algorithm involves simple arithmetic calculations and does not have dependencies between iterations, using std::execution::par_unseq can indeed be beneficial. This allows the compiler to apply both parallelization and vectorization optimizations, taking advantage of SIMD instructions.

Here’s the corrected code that uses std::execution::par_unseq for both generating the dataset and performing gradient descent:

std::mt19937 random;
std::uniform_real_distribution<double> reals(-1.0, 1.0);

int N = 1000000;
std::vector<double> x(N), y(N);

time<std::chrono::milliseconds>("generate - ", [&]() {
    std::generate(std::execution::par_unseq, x.begin(), x.end(), [&]() { return reals(random); });
    std::generate(std::execution::par_unseq, y.begin(), y.end(), [&]() { return reals(random); });
});

time<std::chrono::milliseconds>("gradient_descent - ", [&]() {
    auto [m, b] = gradient_descent(x, y, 0.01, 1);
    std::cout << std::format("m = {}, b = {}\n", m, b);
});

In this updated code, both the generation of the dataset and the gradient descent are executed with the std::execution::par_unseq execution policy. This allows for parallel execution and potential SIMD vectorization optimizations when available.

Thank you for pointing out the correction, and I apologize for the confusion caused.

Seems like the difference between par and par_unseq gets lost respecting the creation time of the two vectors, but un_seq does seems better.

ChatGPT

You’re correct that the distinction between std::execution::par and std::execution::par_unseq may have a more significant impact on the execution of the gradient descent algorithm itself rather than the initial creation of the vectors x and y. Since the creation of the vectors happens only once and is not directly influenced by the execution policy, the effect of the execution policy may not be as noticeable in this specific case.

However, when it comes to the execution of the gradient descent algorithm, using std::execution::par_unseq can still offer benefits. The key advantage of std::execution::par_unseq is its ability to perform vectorization for loop iterations when possible. Since the gradient descent algorithm involves arithmetic calculations on data that is typically stored in contiguous memory (in the x and y vectors), vectorization can potentially improve performance by performing computations in parallel across multiple data elements.

In summary, while the execution policy may have a limited impact on the initial creation of the vectors, using std::execution::par_unseq can still be beneficial for the subsequent execution of the gradient descent algorithm due to its potential for vectorization and parallelization.

Thank you for your observation, and I apologize for any confusion caused earlier.

Ok I’m going to time each parallel and non-parallel gradient decent implementation now.

ChatGPT

That sounds like a great plan! Timing the parallel and non-parallel implementations of the gradient descent algorithm will allow you to compare their performance and evaluate the impact of parallel execution.

Here’s an example of how you can time the parallel and non-parallel implementations separately:

std::mt19937 random;
std::uniform_real_distribution<double> reals(-1.0, 1.0);

int N = 1000000;
std::vector<double> x(N), y(N);

// Timing for parallel implementation
auto parallel_execution_time = time<std::chrono::milliseconds>("Parallel Gradient Descent", [&]() {
    auto [m, b] = gradient_descent(x, y, 0.01, 1);
    // No console output here
});

// Timing for non-parallel implementation
auto sequential_execution_time = time<std::chrono::milliseconds>("Sequential Gradient Descent", [&]() {
    auto [m, b] = gradient_descent(x, y, 0.01, 1);
    // No console output here
});

std::cout << "Parallel Gradient Descent execution time: " << parallel_execution_time.count() << " milliseconds\n";
std::cout << "Sequential Gradient Descent execution time: " << sequential_execution_time.count() << " milliseconds\n";

In this code, the gradient_descent function is called twice, once for the parallel implementation and once for the non-parallel (sequential) implementation. The execution time for each implementation is measured using the time function, and the results are printed to the console.

By comparing the execution times of the parallel and non-parallel implementations, you can assess the performance benefits of parallel execution for the gradient descent algorithm.

Feel free to adjust the parameters, such as the size of the dataset (N) and the learning rate, to further explore the impact of parallelism on performance.

I hope the timing process provides valuable insights into the performance characteristics of the different implementations.

Not quite, I would need another gradient decent function considering there is no interface there to distinguish par from non-par. So, we’ll do that with two separate functions, one with a parallel implementation and a second with a single threaded but potentially SIMD, implementation.

ChatGPT

Apologies for the confusion. I understand now that you would like separate functions to explicitly distinguish between the parallel and non-parallel implementations of the gradient descent algorithm.

Here’s an example of how you can define two separate functions, one for the parallel implementation and the other for the non-parallel (potentially SIMD) implementation:

GradientDescentPair gradient_descent_parallel(std::span<const double> x, std::span<const double> y, const double learning_rate = 0.01, const int num_iterations = 100) {
    // Parallel implementation code here
    // ...
    return { m, b };
}

GradientDescentPair gradient_descent_single_threaded(std::span<const double> x, std::span<const double> y, const double learning_rate = 0.01, const int num_iterations = 100) {
    // Single-threaded (potentially SIMD) implementation code here
    // ...
    return { m, b };
}

In this code, gradient_descent_parallel represents the parallel implementation of the gradient descent algorithm, while gradient_descent_single_threaded represents the single-threaded (potentially SIMD) implementation.

You can then time the two implementations separately and compare their performance:

std::mt19937 random;
std::uniform_real_distribution<double> reals(-1.0, 1.0);

int N = 1000000;
std::vector<double> x(N), y(N);

// Timing for parallel implementation
auto parallel_execution_time = time<std::chrono::milliseconds>("Parallel Gradient Descent", [&]() {
    auto [m, b] = gradient_descent_parallel(x, y, 0.01, 1);
    // No console output here
});

// Timing for single-threaded (potentially SIMD) implementation
auto sequential_execution_time = time<std::chrono::milliseconds>("Single-threaded Gradient Descent", [&]() {
    auto [m, b] = gradient_descent_single_threaded(x, y, 0.01, 1);
    // No console output here
});

std::cout << "Parallel Gradient Descent execution time: " << parallel_execution_time.count() << " milliseconds\n";
std::cout << "Single-threaded Gradient Descent execution time: " << sequential_execution_time.count() << " milliseconds\n";

In this updated code, the gradient_descent_parallel and gradient_descent_single_threaded functions represent the separate implementations of the gradient descent algorithm. The timing process measures the execution time of each implementation separately, and the results are printed to the console.

Please note that the actual implementation inside the gradient_descent_parallel and gradient_descent_single_threaded functions may vary based on the specific parallelization and SIMD techniques you want to employ.

I apologize for any confusion caused earlier, and I hope this clarification helps you achieve the desired separation and timing of the different implementations.

Not quite, the time function prints the elapsed time internally.

ChatGPT

Apologies for the confusion. You are correct that the time function already prints the elapsed time internally, so there’s no need to explicitly print it again.

To time the parallel and non-parallel implementations without redundant console output, you can modify the code as follows:

std::mt19937 random;
std::uniform_real_distribution<double> reals(-1.0, 1.0);

int N = 1000000;
std::vector<double> x(N), y(N);

// Timing for parallel implementation
auto parallel_execution_time = time<std::chrono::milliseconds>("Parallel Gradient Descent", [&]() {
    gradient_descent_parallel(x, y, 0.01, 1);
});

// Timing for single-threaded (potentially SIMD) implementation
auto sequential_execution_time = time<std::chrono::milliseconds>("Single-threaded Gradient Descent", [&]() {
    gradient_descent_single_threaded(x, y, 0.01, 1);
});

std::cout << "Parallel Gradient Descent execution time: " << parallel_execution_time.count() << " milliseconds\n";
std::cout << "Single-threaded Gradient Descent execution time: " << sequential_execution_time.count() << " milliseconds\n";

In this updated code, the time function is called with each implementation of the gradient descent algorithm. The timing function internally prints the elapsed time, so there’s no need to print it explicitly. The execution times are then accessed through the parallel_execution_time and sequential_execution_time variables and printed to the console.

I apologize for any confusion caused earlier. This version should provide the desired timing comparison between the parallel and non-parallel implementations without redundant console output.

The code should be as follows: ( I accidentally hit enter and did not paste the code.)

ChatGPT

Apologies for the confusion. Based on your previous comments, here’s the corrected code to time the parallel and non-parallel implementations without redundant console output:

std::mt19937 random;
std::uniform_real_distribution<double> reals(-1.0, 1.0);

int N = 1000000;
std::vector<double> x(N), y(N);

time<std::chrono::milliseconds>("Parallel Gradient Descent", [&]() {
    gradient_descent_parallel(x, y, 0.01, 1);
});

time<std::chrono::milliseconds>("Single-threaded Gradient Descent", [&]() {
    gradient_descent_single_threaded(x, y, 0.01, 1);
});

In this corrected code, the time function is called separately for the parallel and single-threaded implementations of the gradient descent algorithm. The execution time is internally measured and printed by the time function itself. The console output with the values of m and b is included within the respective timing blocks.

I apologize for the confusion earlier, and I appreciate your patience. This version should correctly time the parallel and single-threaded implementations of the gradient descent algorithm and display the results accordingly.

 time<std::chrono::milliseconds>("gradient_descent - par", [&]() {

        auto [m, b] = gradient_descent_par(x, y, 0.01, 1);

       std::cout << std::format("m = {}, b = {}\n", m, b);
    });
    time<std::chrono::milliseconds>("gradient_descent - single ", [&]() {

        auto [m, b] = gradient_descent_single(x, y, 0.01, 1);

        std::cout << std::format("m = {}, b = {}\n", m, b);
        });

ChatGPT

Apologies for the confusion. Based on your updated code, here’s the corrected version that times the parallel and single-threaded implementations of the gradient descent algorithm:

std::mt19937 random;
std::uniform_real_distribution<double> reals(-1.0, 1.0);

int N = 1000000;
std::vector<double> x(N), y(N);

time<std::chrono::milliseconds>("gradient_descent - par", [&]() {
    auto [m, b] = gradient_descent_par(x, y, 0.01, 1);
    std::cout << std::format("m = {}, b = {}\n", m, b);
});

time<std::chrono::milliseconds>("gradient_descent - single", [&]() {
    auto [m, b] = gradient_descent_single(x, y, 0.01, 1);
    std::cout << std::format("m = {}, b = {}\n", m, b);
});

The parallel function is going to be this:

using GradientDescentPair = std::pair<double, double>;
GradientDescentPair gradient_descent_par(std::span< const double> x, std::span< const double> y
    , const double learning_rate = 0.01, const int num_iterations = 100) {

    if (x.size() != y.size()) {
        throw std::invalid_argument(std::format("Error: x and y must have the same size, but they are {} and {}.", x.size(), y.size()));
    }

    const auto N = x.size();
    double m = 0, b = 0;
    std::vector<double> grad_m(N, 0), grad_b(N, 0);

    for (auto i = 0; i < num_iterations; ++i) {

        //perform gradient calculations in parallel
        std::for_each(std::execution::par, x.begin(), x.end(),
            [&](auto& xi) {

                std::size_t j = &xi - x.data();

                auto y_est = m * xi + b;
                auto y2 = -2.0 / N * (y[j] - y_est);

                grad_m[j] = y2 * xi;
                grad_b[j] = y2;
            });

        //perform accumulation in parallel
        auto parAccumulate = [&]() {
            std::array<double, 2> gradients = { 0.0, 0.0 };
            std::array<std::span<double>, 2> spans{ grad_m, grad_b };

            std::for_each(std::execution::par, spans.begin(), spans.end(), [&](auto& span) {

                std::size_t i = &span - spans.data();

                gradients[i] = std::accumulate(span.begin(), span.end(), 0.0);

                });
            return gradients;
        };

        auto [g_m_sum, g_b_sum] = parAccumulate();

        m -= learning_rate * g_m_sum;
        b -= learning_rate * g_b_sum;

        if (i % 10 == 0) {

            auto compute_error = [&]() {
                double totalError = 0.0;
                for (size_t i = 0; i < N; ++i) {
                    totalError += std::pow(y[i] - (m * x[i] + b), 2);
                }
                return totalError / N;
            };

            std::cout << std::format("Error at iteration {} = {}\n", i, compute_error());
        }
    }
    return { m, b };
}

ChatGPT

using GradientDescentPair = std::pair<double, double>;

GradientDescentPair gradient_descent_par(std::span<const double> x, std::span<const double> y, const double learning_rate = 0.01, const int num_iterations = 100) {
    if (x.size() != y.size()) {
        throw std::invalid_argument(std::format("Error: x and y must have the same size, but they are {} and {}.", x.size(), y.size()));
    }

    const auto N = x.size();
    double m = 0, b = 0;
    std::vector<double> grad_m(N, 0), grad_b(N, 0);

    for (auto i = 0; i < num_iterations; ++i) {
        // Perform gradient calculations in parallel
        std::for_each(std::execution::par, x.begin(), x.end(),
            [&](const auto& xi) {
                const std::size_t j = &xi - x.data();

                const auto y_est = m * xi + b;
                const auto y2 = -2.0 / N * (y[j] - y_est);

                grad_m[j] = y2 * xi;
                grad_b[j] = y2;
            });

        // Perform accumulation in parallel
        auto parAccumulate = [&]() {
            std::array<double, 2> gradients = { 0.0, 0.0 };
            std::array<std::span<const double>, 2> spans{ grad_m, grad_b };

            std::for_each(std::execution::par, spans.begin(), spans.end(), [&](const auto& span) {
                const std::size_t i = &span - spans.data();
                gradients[i] = std::accumulate(span.begin(), span.end(), 0.0);
            });

            return gradients;
        };

        const auto [g_m_sum, g_b_sum] = parAccumulate();

        m -= learning_rate * g_m_sum;
        b -= learning_rate * g_b_sum;

        if (i % 10 == 0) {
            auto compute_error = [&]() {
                double totalError = 0.0;
                for (size_t i = 0; i < N; ++i) {
                    totalError += std::pow(y[i] - (m * x[i] + b), 2);
                }
                return totalError / N;
            };

            std::cout << std::format("Error at iteration {} = {}\n", i, compute_error());
        }
    }

    return { m, b };
}

In this updated code, the gradient_descent_par function uses parallel execution for gradient calculations and accumulation. The std::execution::par execution policy is used with std::for_each to parallelize the calculations. Additionally, the accumulation step is parallelized using std::for_each on two spans (grad_m and grad_b).

Once again, thank you for providing the code, and I apologize for any confusion caused earlier.

The single threaded function will look like this:

GradientDescentPair gradient_descent_single(std::span< const double> x, std::span< const double> y
    , const double learning_rate = 0.01, const int num_iterations = 100) {

    if (x.size() != y.size()) {
        throw std::invalid_argument(std::format("Error: x and y must have the same size, but they are {} and {}.", x.size(), y.size()));
    }

    const auto N = x.size();
    double m = 0, b = 0;
    double grad_m=0, grad_b=0;

    for (auto i = 0; i < num_iterations; ++i) {

        //perform gradient calculations in parallel
        std::for_each(std::execution::seq, x.begin(), x.end(),
            [&](auto& xi) {

                std::size_t j = &xi - x.data();

                auto y_est = m * xi + b;
                auto y2 = -2.0 / N * (y[j] - y_est);

                grad_m += y2 * xi;
                grad_b += y2;
            });

        m -= learning_rate * grad_m;
        b -= learning_rate * grad_b;

        if (i % 10 == 0) {

            auto compute_error = [&]() {
                double totalError = 0.0;
                for (size_t i = 0; i < N; ++i) {
                    totalError += std::pow(y[i] - (m * x[i] + b), 2);
                }
                return totalError / N;
            };

         //   std::cout << std::format("Error at iteration {} = {}\n", i, compute_error());
        }
    }
    return { m, b };
}

ChatGPT

Thank you for providing the code for the single-threaded implementation of the gradient_descent function. Here’s the updated code that includes the single-threaded implementation using the std::execution::seq execution policy:

GradientDescentPair gradient_descent_single(std::span<const double> x, std::span<const double> y, const double learning_rate = 0.01, const int num_iterations = 100) {
    if (x.size() != y.size()) {
        throw std::invalid_argument(std::format("Error: x and y must have the same size, but they are {} and {}.", x.size(), y.size()));
    }

    const auto N = x.size();
    double m = 0, b = 0;
    double grad_m = 0, grad_b = 0;

    for (auto i = 0; i < num_iterations; ++i) {
        // Perform gradient calculations sequentially
        std::for_each(std::execution::seq, x.begin(), x.end(),
            [&](const auto& xi) {
                const std::size_t j = &xi - x.data();

                const auto y_est = m * xi + b;
                const auto y2 = -2.0 / N * (y[j] - y_est);

                grad_m += y2 * xi;
                grad_b += y2;
            });

        m -= learning_rate * grad_m;
        b -= learning_rate * grad_b;

        if (i % 10 == 0) {
            auto compute_error = [&]() {
                double totalError = 0.0;
                for (size_t i = 0; i < N; ++i) {
                    totalError += std::pow(y[i] - (m * x[i] + b), 2);
                }
                return totalError / N;
            };

            // std::cout << std::format("Error at iteration {} = {}\n", i, compute_error());
        }
    }

    return { m, b };
}

In this updated code, the gradient_descent_single function performs the gradient calculations and accumulation sequentially using std::execution::seq execution policy. The std::for_each loop iterates over the elements of x sequentially, and the gradient values grad_m and grad_b are accumulated accordingly.

I appreciate your patience, and I apologize for any confusion caused earlier.

It seems the printing of the error needs to be removed in each code because it takes a long time to do.

(ChatGPT produces the code with no cout and we continue )

These functions do not produce the same results.

ChatGPT

Apologies for the oversight. The issue lies in the accumulation step within the gradient_descent_single function, where the gradients are being accumulated across multiple iterations. To fix this, we need to move the accumulation step outside the loop to accumulate the gradients across all iterations. Here’s the updated code for the single-threaded implementation:

GradientDescentPair gradient_descent_single(std::span<const double> x, std::span<const double> y, const double learning_rate = 0.01, const int num_iterations = 100) {
    if (x.size() != y.size()) {
        throw std::invalid_argument(std::format("Error: x and y must have the same size, but they are {} and {}.", x.size(), y.size()));
    }

    const auto N = x.size();
    double m = 0, b = 0;
    double grad_m_sum = 0, grad_b_sum = 0;

    for (auto i = 0; i < num_iterations; ++i) {
        double grad_m = 0, grad_b = 0;

        // Perform gradient calculations sequentially
        std::for_each(x.begin(), x.end(), [&](const auto& xi) {
            const std::size_t j = &xi - x.data();

            const auto y_est = m * xi + b;
            const auto y2 = -2.0 / N * (y[j] - y_est);

            grad_m += y2 * xi;
            grad_b += y2;
        });

        grad_m_sum += grad_m;
        grad_b_sum += grad_b;

        m -= learning_rate * grad_m_sum;
        b -= learning_rate * grad_b_sum;
    }

    return { m, b };
}

With this correction, the single-threaded implementation should produce consistent results with the parallel implementation. I apologize for the mistake in the previous response, and I appreciate your understanding.

Ok, the single threaded takes half as long as the multithreaded, it seems.

ChatGPT

It’s interesting to observe that the single-threaded implementation takes half the time compared to the multi-threaded implementation. There can be several reasons for this behavior.

In the case of the single-threaded implementation, the calculations are performed sequentially, and there is no overhead associated with thread synchronization and parallel execution. This can result in faster execution times when the computation is not highly parallelizable or when the dataset size is relatively small.

On the other hand, the multi-threaded implementation parallelizes the gradient calculations and accumulation steps, which can provide performance benefits for larger datasets or when the computations can be efficiently distributed across multiple threads. However, there is an additional overhead associated with thread creation, synchronization, and coordination, which can impact the overall execution time.

The performance difference between the single-threaded and multi-threaded implementations can vary depending on various factors such as the dataset size, the complexity of the computations, the number of available CPU cores, and the efficiency of the parallelization techniques used.

It’s important to note that measuring the execution time of a program can be influenced by various factors, including the hardware, operating system, compiler optimizations, and other concurrent processes running on the system. It’s recommended to perform multiple runs and average the results to get a more reliable assessment of the performance difference.

If you are looking to further optimize the multi-threaded implementation, you might consider techniques such as workload partitioning, reducing the granularity of parallel tasks, minimizing data dependencies, and utilizing efficient parallel algorithms to potentially improve its performance.

Overall, the choice between single-threaded and multi-threaded implementations depends on the specific requirements, dataset size, and available resources. Both approaches have their advantages and trade-offs, and it’s important to consider the specific context and performance goals when selecting the appropriate implementation.

So, the seq non parallel code, is three times faster given the functions and following calling params:

     std::mt19937 random;
    std::uniform_real_distribution<double> reals(-1.0, 1.0);

    int N = 1000000;
    std::vector<double> x(N), y(N);

    // time<std::chrono::milliseconds>("generate - ", [&]() {
    std::generate(std::execution::seq, x.begin(), x.end(), [&]() { return reals(random); });
    std::generate(std::execution::seq, y.begin(), y.end(), [&]() { return reals(random); });
    //     });
  
    time<std::chrono::milliseconds>("gradient_descent - par", [&]() {

        auto [m, b] = gradient_descent_par(x, y, 0.01, 10000);

       std::cout << std::format("m = {}, b = {}\n", m, b);
    });
    time<std::chrono::milliseconds>("gradient_descent - single", [&]() {

        auto [m, b] = gradient_descent_single(x, y, 0.01, 10000);

         std::cout << std::format("m = {}, b = {}\n", m, b);
        });

ChatGPT

It’s interesting to see that in your specific case, the sequential (non-parallel) code performs significantly faster than the parallel code. The execution time can vary depending on several factors, including the specific hardware, compiler optimizations, and the nature of the computation.

In your updated code, you generate the x and y vectors using std::generate with the std::execution::seq execution policy, which ensures sequential execution. This can result in better performance in cases where the computation is not highly parallelizable or when the dataset size is relatively small.

On the other hand, the parallel implementation of the gradient descent algorithm, using the gradient_descent_par function, parallelizes the gradient calculations and accumulation steps. Although parallelization can provide performance benefits for larger datasets or highly parallelizable computations, it introduces additional overhead due to thread synchronization and coordination. In this specific case, the overhead associated with parallel execution seems to offset the benefits of parallelism.

It’s worth noting that the relative performance between the sequential and parallel implementations can vary depending on factors such as dataset size, computational complexity, hardware capabilities, and compiler optimizations. It’s important to consider these factors and perform thorough benchmarking to make informed decisions about the choice of sequential or parallel execution.

If performance is a critical factor, you can consider optimizing the parallel implementation further by experimenting with different parallelization strategies, such as workload partitioning, task granularity, and data dependencies. Additionally, profiling tools can help identify potential performance bottlenecks and areas for improvement.

Overall, the choice between sequential and parallel execution depends on the specific requirements, dataset size, available resources, and the nature of the computation. It’s important to evaluate the trade-offs and choose the appropriate implementation based on careful analysis and benchmarking.

Let’s work to make the parallel aspect actually faster than the sequential aspect, I have noticed the trend to analyze parallel code a lot for it becoming easily a lot slower than sequential code.

ChatGPT

Certainly! Optimizing parallel code to achieve better performance than sequential code can be challenging but rewarding. Here are some suggestions to improve the parallel implementation and potentially make it faster than the sequential implementation:

Workload Distribution: Consider optimizing the workload distribution among threads. Ensure that the work is evenly distributed among threads to minimize load imbalance. You can experiment with different partitioning strategies, such as static or dynamic workload assignment, to achieve better load balancing.
Data Dependencies: Analyze the data dependencies in the parallel code. Identify any unnecessary dependencies that can hinder parallelism. Minimize dependencies by reorganizing the computations or using techniques like loop unrolling or loop tiling.
Memory Access Patterns: Optimize memory access patterns to improve cache utilization. Ensure data locality by accessing memory in a way that minimizes cache misses. This can involve reordering loops, restructuring data structures, or using data layout optimizations.
Parallel Algorithms: Consider utilizing parallel algorithms and data structures tailored for specific tasks. Use parallel versions of algorithms, such as parallel reduction or parallel prefix sum, that exploit parallelism effectively.
Granularity: Adjust the granularity of parallel tasks to balance overhead and parallelization benefits. Fine-grained parallelism can lead to excessive synchronization overhead, while coarse-grained parallelism may limit parallel execution. Experiment with different task granularities to find the optimal balance.
Thread Synchronization: Minimize the need for synchronization between threads. Avoid unnecessary locks or critical sections that can introduce contention. Utilize thread-safe data structures or techniques like lock-free programming when applicable.
Compiler Optimizations: Enable compiler optimizations to enhance the performance of the parallel code. Utilize compiler flags or directives to enable optimizations such as loop vectorization, loop unrolling, or automatic parallelization.
Profiling and Benchmarking: Profile the parallel code to identify performance bottlenecks and areas for improvement. Use profiling tools to analyze thread behavior, identify hotspots, and measure resource utilization. Benchmark the code with different input sizes and thread configurations to evaluate performance and identify optimization opportunities.

By applying these techniques and experimenting with different optimizations, you can potentially improve the performance of the parallel implementation and make it faster than the sequential implementation. However, it’s important to note that the effectiveness of these optimizations may vary depending on the specific characteristics of your algorithm, dataset, hardware, and compiler. It may require iterative experimentation and fine-tuning to achieve the desired performance improvements.

OK, I have started to approach single threaded performance, from being 3 times slower to being only 1.5 times slower, and this is the code:

using GradientDescentPair = std::pair<double, double>;
GradientDescentPair gradient_descent_par(std::span< const double> x, std::span< const double> y
    , const double learning_rate = 0.01, const int num_iterations = 100) {

    if (x.size() != y.size()) {
        throw std::invalid_argument(std::format("Error: x and y must have the same size, but they are {} and {}.", x.size(), y.size()));
    }

    const auto N = x.size();
    double m = 0, b = 0;
    std::vector<double> grad(N);

    for (auto i = 0; i < num_iterations; ++i) {
        
        //perform gradient calculations in parallel
        std::for_each(std::execution::par_unseq, x.begin(), x.end(),
            [&](auto& xi) {

                std::size_t j = &xi - x.data();

                auto y_est = m * xi + b;
                auto y2 = -2.0 / N * (y[j] - y_est);

                grad[j] = y2;
            });
            
        double g_b_sum = std::reduce(std::execution::par_unseq, grad.begin(), grad.end(), 0.0);
        
        std::for_each(std::execution::par_unseq, x.begin(), x.end(),
            [&](auto& xi) {

                std::size_t j = &xi - x.data();

                grad[j] *= xi;
            });

        double g_m_sum = std::reduce(std::execution::par_unseq, grad.begin(), grad.end(), 0.0);

        m -= learning_rate * g_m_sum;
        b -= learning_rate * g_b_sum;
    }
    return { m, b };
}

ChatGPT

Great progress! The modifications you made to the parallel implementation are moving it closer to the single-threaded performance. The changes you made include using std::execution::par_unseq for parallel execution and utilizing std::reduce for accumulation. Here’s the updated code with your modifications:

(Repeats code)

The changes you made include:

Using std::execution::par_unseq for parallel execution in both the gradient calculations and gradient updates.
Utilizing std::reduce for parallel accumulation of gradients.

These modifications should improve the parallel performance of the algorithm. It’s great to see that the parallel implementation is getting closer to the single-threaded performance. Continue experimenting and fine-tuning to achieve the best possible performance.

Remember to benchmark and profile the code to measure the impact of each modification and identify potential bottlenecks. This iterative process will help you find the most effective optimizations for your specific case.

Good luck with your further optimizations, and let me know if you have any more questions!

How can we further approach single threaded performance, and indeed surpass it?

ChatGPT

To further approach and potentially surpass single-threaded performance in the parallel implementation, you can consider the following strategies:

Fine-Grained Parallelism: Analyze the code for opportunities to increase the granularity of parallel tasks. Fine-grained parallelism allows for better load balancing and can maximize the utilization of multiple CPU cores. Consider breaking down the computations into smaller tasks that can be executed in parallel.
Thread Synchronization and Dependencies: Examine the code for any unnecessary thread synchronization or dependencies that might limit parallelism. Minimize shared data access and critical sections to reduce contention and enable more parallel execution.
Data Locality and Memory Access: Optimize memory access patterns to improve cache utilization. Ensure that data accessed by each thread is localized to reduce cache misses and improve performance. Consider using techniques such as loop tiling or loop blocking to enhance data locality.
Vectorization and SIMD: Utilize vectorization and Single Instruction Multiple Data (SIMD) instructions to exploit parallelism at the hardware level. Use compiler directives, such as pragmas or intrinsics, to enable auto-vectorization or manually vectorize computations when possible. SIMD instructions can process multiple data elements in parallel, providing performance benefits.
Load Balancing: Evaluate the workload distribution among threads to ensure optimal load balancing. Identify any load imbalances or performance bottlenecks and redistribute the work accordingly. Consider dynamic load balancing techniques to adaptively distribute workloads based on runtime characteristics.
Parallel Algorithms and Libraries: Utilize parallel algorithms and libraries specifically designed for the task at hand. Explore parallel algorithms for common computations, such as reductions, prefix sums, or sorting. Leverage high-performance parallel libraries, like Intel Threading Building Blocks (TBB) or OpenMP, that provide efficient parallel constructs and optimizations.
Task Parallelism: Explore task-based parallelism using frameworks like Intel TBB, OpenMP tasks, or C++17’s std::async. Task-based parallelism allows for finer-grained parallelism and can dynamically adapt to workload changes. It provides flexibility in managing and scheduling parallel tasks.
Profiling and Optimization: Continuously profile the code to identify performance bottlenecks and hotspots. Use performance profiling tools to analyze the behavior of the parallel execution and identify areas for optimization. Measure and benchmark different configurations, optimizations, and approaches to iteratively improve performance.

Remember that achieving better performance than single-threaded execution is not always guaranteed, as it depends on factors such as the nature of the computation, available hardware resources, and the size of the dataset. It’s crucial to thoroughly analyze and understand the problem, apply appropriate optimization techniques, and carefully measure the performance impact of each modification.

Be aware that achieving optimal parallel performance can be complex and may require expertise in parallel programming and performance optimization. It may involve experimenting with different approaches, fine-tuning parameters, and iterating through the optimization process.

Good luck with your optimization efforts, and feel free to ask if you have any further questions!

That code was approaching single threaded performance, but I think the overhead of working with so much additional memory and using so many threading areas, has prevented it from performing.

ChatGPT

I understand your concern about the overhead introduced by the additional memory usage and threading areas in the parallel implementation. It’s true that parallelization introduces some overhead due to thread management, synchronization, and memory access patterns. This overhead can sometimes outweigh the potential benefits of parallelism, especially for certain computations or smaller dataset sizes.

If the parallel implementation is not providing the expected performance gains and the overhead is becoming a limiting factor, you have a few options to consider:

Data Partitioning: Instead of parallelizing the entire computation, consider partitioning the data into smaller chunks and processing each chunk sequentially. This can help reduce the overhead associated with parallel execution while still providing some level of parallelism. You can experiment with different partitioning strategies to find the optimal balance between parallelism and overhead.
Hybrid Approaches: Explore hybrid approaches that combine sequential and parallel execution. For example, you can perform initial iterations sequentially to set up the problem and estimate initial values, and then switch to parallel execution for subsequent iterations. This can help reduce the overhead while still benefiting from parallel processing.
Algorithmic Optimization: Analyze the algorithm and computations to identify opportunities for algorithmic optimization. Look for ways to reduce the number of computations, simplify expressions, or eliminate unnecessary operations. By optimizing the algorithm itself, you can potentially achieve performance improvements that surpass the sequential implementation.
Vectorization: Explore vectorization techniques to utilize SIMD instructions and process multiple data elements in parallel. This can often provide performance benefits without the overhead of threading. Consider restructuring the computations to take advantage of SIMD instructions and ensure data alignment for efficient vectorization.
Profile and Benchmark: Use profiling and benchmarking tools to identify performance bottlenecks and areas for improvement. Measure the performance of different components of the code to pinpoint the most time-consuming parts. Focus on optimizing those areas to achieve the best possible performance.

It’s important to note that the effectiveness of these approaches may vary depending on the specific characteristics of your computation, dataset size, hardware architecture, and compiler optimizations. Careful analysis, experimentation, and measurement are crucial in determining the optimal approach for your specific case.

Remember that achieving optimal performance is often an iterative process that requires experimentation, profiling, and fine-tuning. It may involve trade-offs and compromises based on the specific requirements and constraints of your problem.

I hope these suggestions help you further optimize your code. Good luck with your efforts, and feel free to ask if you have any more questions!

Here’s where I’m at now, which is performing about twice as fast as the sequential code:

GradientDescentPair gradient_descent_par2(std::span< const double> x, std::span< const double> y
    , const double learning_rate = 0.01, const int num_iterations = 100) {

    if (x.size() != y.size()) {
        throw std::invalid_argument(std::format("Error: x and y must have the same size, but they are {} and {}.", x.size(), y.size()));
    }

    const auto N = x.size();

    struct Section {
        std::span<const double> x, y;
        GradientDescentPair mb;
    };
    auto setupParallelSections = [&](std::size_t workLength) {

        const std::size_t sectionNum = std::thread::hardware_concurrency();
        const std::size_t sectionSize = workLength / sectionNum;
        std::vector<Section> sections(sectionNum);
        
        for (std::size_t offset = 0, size = sectionSize; auto& section : sections) {
            if (&section == &sections.back()) {
                //if this is the last section, make sure to pick up all remainder
                size = N - offset;
            }
            section = { {x.begin() + offset, size}, {y.begin() + offset, size}, {0,0} };
            offset += size;
        }
        return sections;
    };
    auto sections = setupParallelSections(N);

    double m{ 0 }, b{ 0 };
    for (auto i = 0; i < num_iterations; ++i) {

        std::for_each(std::execution::par, sections.begin(), sections.end(), [&](auto& section) {

            double g_m = 0.0, g_b = 0.0;

            auto& x = section.x;
            auto& y = section.y;

            std::for_each(std::execution::seq, x.begin(), x.end(),
                [&](auto& xi) {

                    std::size_t i = &xi - x.data();

                    auto y_est = m * xi + b;
                    auto y2 = -2.0 / N * (y[i] - y_est);

                    g_m += y2 * xi;
                    g_b += y2;
                });

            section.mb = { g_m, g_b };

        });

        double m_sum = 0, b_sum = 0;
        for (auto& section : sections) {
            auto& [sm, sb] = section.mb;
            m_sum += sm;
            b_sum += sb;
        }
        m -= m_sum * learning_rate;
        b -= b_sum * learning_rate;

    }

    auto compute_error = [&]() {
        double totalError = 0.0;
        for (size_t i = 0; i < N; ++i) {
            totalError += std::pow(y[i] - (m * x[i] + b), 2);
        }
        return totalError / N;
    };

    std::cout << std::format("Error = {}\n", compute_error());

    return { m,b };
}

About C++ Programming, with ChatGPT.

Hello, if I asked you to teach me C++ from A to Z, how would you proceed?

ChatGPT:

Sure, I’d be glad to help you learn C++! Learning a programming language like C++ is a multi-step process that involves understanding the basics of the language, practicing coding, and building projects to apply your skills. Here’s a general roadmap to learn C++ from A to Z:

1. Understanding Programming Basics Before you start learning any programming language, it’s important to understand the basics of computer programming. This includes concepts like:

What is a program?
What is a programming language?
Different types of programming languages (Compiled vs. Interpreted)

2. Introduction to C++ Next, you’d start with the basics of C++. This would include:

A brief history of C++
The structure of a C++ program
Compiling and running your first C++ program

3. Basic Syntax and Data Types Here, you’d learn about the various data types in C++, how to declare variables, and the basic syntax of the language. Topics would include:

Variables and Data Types
Constants and Literals
Operators
Basic Input/Output

4. Control Structures You’d then learn about control structures in C++, which control the flow of a program. This includes:

Decision Making (if, if…else, switch)
Loops (for, while, do…while)
Jump statements (break, continue, goto)

5. Functions Functions are a key part of any programming language. You’d learn about:

Defining and calling functions
Function prototypes
Scope of variables
Recursion

6. Arrays and Strings Next, you’d learn about arrays and strings, which are used to store multiple values of the same type. Topics would include:

Declaration and initialization of arrays and strings
Accessing array elements
Multi-dimensional arrays

7. Pointers and References Pointers are a unique feature of C++ that can be difficult to understand. You’d learn about:

What pointers are and how to use them
References
The relationship between arrays and pointers

8. Data Structures C++ has several built-in data structures. You’d learn about:

Structs
Unions
Enumerations

9. Object-Oriented Programming C++ is an object-oriented programming language. You’d learn about:

Classes and Objects
Inheritance
Polymorphism
Abstraction
Encapsulation

10. Standard Template Library (STL) The STL is a powerful library in C++ that provides several useful data structures and algorithms. You’d learn about:

Containers (vectors, lists, queues, stacks, sets, maps)
Algorithms
Iterators

11. Error Handling Finally, you’d learn about error handling in C++. This would include:

Exception handling (try, catch, throw)
Creating your own exception classes

12. File I/O Interacting with files is a common task in many programs. You’d learn about:

Reading from and writing to files
File modes
File pointers

13. Building Projects Once you’ve learned the basics of the language, the best way to improve is by building projects. This could include:

Simple console-based games
Database applications
GUI applications

Throughout this process, it’s important to practice regularly. Coding is a skill that improves with practice, so try to write code every day. Also, remember to read and understand the code of others. This can provide valuable insight into different ways to solve problems.

Lastly, don’t hesitate to ask for help. Whether from a mentor, a peer, or an online community, getting help when you’re stuck is a valuable part of the learning process.

14. Advanced Topics After you’ve become comfortable with the basics and have some projects under your belt, you might want to explore some more advanced topics:

Multi-threading and concurrency: Understand how to write code that can perform multiple tasks at the same time.
Memory management: Deepen your understanding of dynamic memory and how to manage it properly.
Templates: Learn how to write code that is type-agnostic and can work with different data types.
Networking: Learn about sockets and how to write programs that communicate over the internet.
Design patterns: These are tried and tested solutions to common problems in software design, and learning them can make you a better programmer.

15. Reading Documentation and Understanding Libraries Learning to read and understand documentation is a crucial skill for any programmer. It enables you to learn and use libraries and frameworks that can greatly increase your productivity.

16. Debugging and Testing Learn about various techniques for debugging your code, and understand how to write tests to ensure your code is working as expected. This includes unit testing, integration testing, and regression testing.

17. Code Reviews Participating in code reviews, both as the reviewer and reviewee, is a great way to learn from others and improve your own coding skills.

18. Contributing to Open Source Once you’re comfortable with C++, contributing to open source projects can be a great way to improve your skills and contribute to the community. You’ll learn to collaborate with others, use version control systems, and work on large codebases.

19. Continuous Learning Programming is a field that’s constantly evolving, and the most successful programmers are those who continue to learn throughout their career. Keep up with new C++ standards, explore other programming languages, and never stop learning.

Remember, learning to code is a marathon, not a sprint. It’s okay to feel overwhelmed at times, and it’s normal to not understand everything immediately. Just keep at it, and with time and practice, you’ll get there.

Would you like to start with any specific topic or have any questions about the roadmap?

You know, I think discussion of assembly is necessary since it is an integral part of C++ concept, and programming in general, it explains mechanics or possible ones given syntactical expression of C++ in particular.

ChatGPT

You’re absolutely right, a basic understanding of assembly language can be very beneficial when learning C++. It’s not necessary for beginners, but as you start working with more complex code, understanding assembly can help you write more efficient code and debug more effectively.

Assembly language is a low-level programming language that closely corresponds to the machine code instructions of a specific CPU architecture. This is in contrast to high-level languages like C++, which are designed to be portable across multiple systems.

Here are some ways in which understanding assembly language can be beneficial when programming in C++:

1. Performance Tuning Understanding assembly can help you write more efficient C++ code. For example, you might see how different C++ constructs compile to different assembly instructions and use this knowledge to choose the most efficient construct.

2. Debugging Sometimes, a debugger may drop down to the assembly level, especially when you’re stepping through code at a low level. Understanding assembly can make it easier to understand what’s going on in these situations.

3. Security Many security exploits involve low-level code, and understanding assembly can help you write more secure code.

4. Understanding Computers Assembly language is much closer to the hardware than C++. Learning it can give you a better understanding of how computers work, which can make you a better programmer.

If you’re interested in learning assembly as part of your C++ education, here’s a simple roadmap:

1. Understanding the Basics Learn about the basic components of assembly language, like registers, memory, instructions, and addressing modes.

2. Simple Programs Start by writing simple assembly programs, like adding two numbers or sorting an array. This will give you a feel for the language.

3. Interfacing with C++ Learn how to call assembly code from C++ and vice versa. This will allow you to use assembly where it’s beneficial while still taking advantage of the power and convenience of C++.

4. Complex Programs As you get more comfortable with assembly, start writing more complex programs. This will help solidify your understanding and give you practice writing assembly.

5. Reading Compiler Output A good way to learn assembly is by looking at the assembly code generated by a C++ compiler. This can give you insight into how different C++ constructs are implemented at the machine level.

6. Optimization Finally, learn about optimization techniques in assembly. This can help you write faster, more efficient code.

Remember, assembly language is complex and difficult to read, so it’s not necessary for most C++ programming. However, if you’re interested in low-level programming or want to understand computers at a deeper level, learning assembly can be very beneficial.

And also directly compare a pairs of code, one in C++, to one or more in potential assembly. For instance, a brief program in assembly that explains the steps to perform a line of some mathematical or logical expression in C++. You did not specifically mention the property of short circuiting which is a necessary logical concept in C++, and this can be explained simply in assembly.

ChatGPT

Certainly, understanding how high-level language constructs translate to assembly can give you a greater understanding of how your code is executed by the machine. It’s worth noting that the exact assembly code generated can vary depending on the compiler and optimization settings used, but we can look at a general example.

Let’s take a simple C++ logical expression that uses short-circuit evaluation:

In this C++ code, we’re using a logical AND (&&) operation, which performs short-circuit evaluation. This means if the first operand is false, the second operand is not evaluated at all. This is important in this case because if b is zero, a / b would cause a division by zero error. However, due to short-circuit evaluation, this operation is never performed if b is zero.

int a = 10;
int b = 0;
if (b != 0 && a / b > 2) {
    // Do something
}

Now, let’s see how this could be translated into assembly. This will be a somewhat simplified and not necessarily exact representation, as actual generated assembly can vary greatly:

mov eax, [b]      ; Load the value of b into the EAX register
test eax, eax     ; Test if EAX (b) is zero
jz end            ; If it is, jump to the end (short circuit)
mov ebx, [a]      ; Load the value of a into the EBX register
div ebx, eax      ; Divide a by b
cmp eax, 2        ; Compare the result to 2
jle end           ; If the result is less than or equal to 2, jump to the end
; Here you would add the assembly code for the "Do something" part
end:
; Rest of the code

This assembly code does the same thing as the C++ code. First, it checks if b is zero, and if it is, it jumps to the end, effectively skipping the rest of the code block. This is the assembly equivalent of the short-circuit behavior in the C++ code. If b is not zero, it goes on to perform the division and comparison.

Again, this is a simplified example. The actual assembly generated by a compiler would likely look quite different and be more complex, particularly for more complex C++ code. However, it demonstrates the basic concept of how a high-level construct like a short-circuiting logical operation can be implemented at the assembly level.

Specifically, the left-to-right reading order of the C++ syntax is not necessarily execution order, and any particular part of function may avoid execution altogether. The compiler causes reordering which is why when writing C++, it can be useful to express language in a way that demystifies somehow, in C++, what the compiler could be doing and hopefully is.

ChatGPT

Indeed, the order of execution in a high-level language like C++ doesn’t necessarily correspond directly to the order of operations at the assembly level. Compilers are free to rearrange the code as long as they maintain the “as-if” rule, which states that the observable behavior of the optimized code must be the same as if it were executed exactly as written.

Understanding this can indeed be beneficial when writing C++. As you write more complex code, you can start to take advantage of this to write more efficient code. For example, knowing that certain operations can be expensive in terms of CPU time, you can structure your code to avoid them when possible.

Consider the following example:

if (expensiveComputation() || simpleCheck()) {
    // Do something
}

In this case, expensiveComputation() is a function that takes a long time to compute, while simpleCheck() is a simple operation. Because C++ uses short-circuit evaluation for the || operator, simpleCheck() won’t be executed if expensiveComputation() returns true.

However, if you know that simpleCheck() is likely to be true most of the time, you could rewrite the code as follows to make it more efficient:

if (simpleCheck() || expensiveComputation()) {
    // Do something
}

Now, expensiveComputation() won’t be executed if simpleCheck() returns true, potentially saving a lot of CPU time.

This is just one example of how understanding the behavior of the compiler can help you write more efficient code. Other techniques might involve understanding how the compiler optimizes loop structures, function calls, and other high-level constructs.

In general, while it’s not necessary to always be thinking about how your C++ code will be translated to assembly, having a basic understanding of the process can certainly make you a better programmer. It can help you understand why certain code constructs are more efficient than others, and it can help you debug and optimize your code more effectively.

A single “if” statement, being a C++ block of code, in assembly is in fact broken down into additional if-equivalent concepts, some of which may or may not be executed in their apparent sequence.

ChatGPT

Yes, you’re correct. The structure of an if statement in C++ does not correspond directly to the sequential flow of execution in assembly language. This is because assembly language lacks the high-level control structures of C++, and instead uses jumps (branching) to control the flow of execution.

Consider a simple if statement in C++:

int a = 5;
int b = 10;
if (a < b) {
    a = b;
}

The equivalent assembly might look something like this:

mov eax, [a]      ; Load the value of a into the EAX register
cmp eax, [b]      ; Compare the value of a to b
jge end_if        ; If a is greater than or equal to b, jump to the end of the if block
mov eax, [b]      ; Otherwise, set a to the value of b
mov [a], eax
end_if:
; Rest of the code

In the assembly code, instead of a clear if block, we have a comparison followed by a conditional jump. If the condition is false, the jump is taken, and the code inside the if block is skipped. If the condition is true, the jump is not taken, and the code inside the if block is executed.

Note that this is a simple example, and real-world compiler-generated assembly would likely be more complex, especially with optimizations enabled.

Also, because of the nature of assembly language, the exact order of the instructions in the assembly code does not necessarily match the order of the code in the C++ program. The compiler can rearrange the instructions for efficiency as long as it maintains the same overall effect (the “as-if” rule).

This is a key point in understanding how high-level constructs in C++ translate into low-level assembly language, and can be helpful in writing more efficient code and debugging at a low level.

And then, while understanding Assembly is essential for good programming practices, understanding the STL is equally important, if not more.

ChatGPT

Absolutely, the Standard Template Library (STL) in C++ is an incredibly powerful tool that can greatly increase your productivity and the efficiency of your code. It provides a set of common classes and functions, which includes, but is not limited to, collections like vectors, lists, and maps, as well as algorithms that allow you to perform operations like sorting, searching, and manipulating these collections.

Here’s a brief overview of the main components of the STL and why they’re important:

1. Containers STL containers are template classes that store data. They manage the storage space that they use, expanding and contracting as needed. STL Containers are like C++ built-in types, they can be created, initialized, copied, and assigned, passed as parameters, and returned as results. Most containers have similar member functions and use similar naming conventions.

2. Algorithms STL algorithms act on containers. They provide the means by which you will manipulate the contents of containers. For example, you might sort the elements, search for specific items or count certain elements.

3. Iterators Iterators are used to step through the elements of collections of things. They are primarily used with sequence containers, though certain associative containers also support them.

4. Function Objects Function objects are objects that can act as functions. This is possible because of operator overloading.

5. Allocators Allocators are used to customize the allocation and deallocation of memory.

Understanding how to use these tools can greatly simplify your code and make it more efficient. For example, instead of manually allocating an array and keeping track of its size, you can simply use a vector that automatically manages its own size.

Furthermore, the STL is designed with efficiency in mind, and its algorithms are generally as fast or faster than equivalent hand-written code. For example, the sort function provided by the STL is typically much faster than a simple sort function you might write yourself.

In short, the STL is a crucial tool for any C++ programmer. Not only does it simplify your code and make it more efficient, but understanding it also gives you a deeper understanding of the C++ language itself. It’s certainly worth spending the time to become comfortable with the STL if you’re serious about programming in C++.

In fact, using the stl is going to increase the capabilities of average experience programmers, and given that STL is constantly advancing with more advanced C++ concepts, like views, these STL concepts are probably expressing a more advanced programming form than older STL and C++ concepts.

ChatGPT

Absolutely. The STL has evolved significantly since its inception, and it continues to incorporate more advanced features and concepts with each new revision of the C++ standard.

One of the goals of the STL and C++ standard committee is to make complex tasks simpler and more intuitive for developers. As a result, modern STL includes features that allow developers to write more concise, expressive, and efficient code.

Let’s take a look at a few modern STL features:

1. Smart Pointers (C++11 and onwards): Traditional raw pointers in C++ are powerful but can easily lead to memory leaks and other issues. Smart pointers, like shared_ptr, unique_ptr, and weak_ptr, help manage the lifecycle of objects in memory, making it easier to write safe and efficient code.

2. Lambda Expressions (C++11 and onwards): Lambda expressions allow you to write inline anonymous functions that can be used in algorithms. This can make your code more concise and expressive.

3. Range-based for loops (C++11 and onwards): These provide a simpler, more readable syntax for iterating over elements in a container.

4. The auto Keyword (C++11 and onwards): This keyword allows the compiler to determine the appropriate type for a variable, which can simplify your code and reduce errors.

5. Concepts (C++20): Concepts are a way to express the intent of a template. They are used to specify the requirements on template arguments, making it easier to understand how to use them.

6. Ranges (C++20): The ranges library is a new way to work with sequences of data in a more expressive and intuitive way. It introduces the concept of “views” which are lightweight, lazy, composable abstractions on top of data.

By using these modern features of the STL, developers can write code that’s more expressive, more efficient, and less prone to errors. Furthermore, as the STL evolves, it often incorporates techniques and patterns that have been proven to be effective in real-world applications, so using the STL can also help developers align their code with best practices.

It’s also worth noting that many modern C++ guidelines, such as the Core Guidelines, encourage the use of these modern STL features. So learning and using them can also be an important part of writing “modern” C++ code.

Understanding these new C++ concepts parts C++ program expression far from C and assembly.

ChatGPT

Indeed, modern C++ introduces a variety of high-level constructs and abstractions that make it a significantly different language from C and make its code look quite dissimilar to assembly. While C++ is a superset of C and retains a lot of its low-level capabilities, it also provides a lot of powerful features that allow for much higher-level programming.

Features like classes, templates, exceptions, and STL containers and algorithms can make C++ code look very different from C code. For example, a simple task like sorting an array of integers can be done in one line in C++ using the STL sort algorithm, whereas in C it might require implementing a sort algorithm manually.

With the introduction of modern features like smart pointers, lambda functions, the auto keyword, range-based for loops, concepts, and ranges, C++ code can become even more high-level and abstract, often looking quite different from traditional C-style code.

For instance, consider this piece of modern C++ code:

std::vector<int> numbers = {5, 4, 3, 1, 2};
std::sort(numbers.begin(), numbers.end());
auto result = std::find_if(numbers.begin(), numbers.end(), [](int n) { return n > 3; });
if (result != numbers.end())
    std::cout << "Found a number greater than 3: " << *result << "\n";

This code, which sorts a vector of numbers and then uses a lambda function to find the first number greater than 3, would look quite different in both C and assembly. In fact, some of these features, like lambda functions and the auto keyword, don’t have direct equivalents in C or assembly.

So yes, while understanding assembly can provide a solid foundation for understanding how computers work and how your C++ code is executed, it’s also important to recognize that modern C++ provides many powerful abstractions that allow you to write code at a much higher level, often making it look quite different from the equivalent assembly code.

One of the great things about the STL, is that one of its concepts is templated style programming, where code A is basically inserted into or adapted to code B in some way, and then some common operation is performed automagically for you. You pass inline lambdas to some calling function in the STL, or using functors of some sort. Functors and lambdas are similar in use-case and basically the same thing. The concept that more and more often, some segment of code is in-fact some object which is passed around for common operations, which you can probably find in part in the STL. In this concept, a functor or lambda, can potentially have a lifetime connected with some variables. The functor in C++ is an object which implements the bubbles or execution operator, (), or another operator, such as comparisons like > or <=>. This operator can be overloaded and accept various arguments. Groups of different code can call a given operator identically, and on fundamentally different objects. This property is exploited through template programming in which some code A is in-fact compiled differently based on the templated property, B. The function, however, is some constant useful function found often in the STL.

ChatGPT

You’ve given a great description of the power and flexibility of template programming in C++, particularly when combined with functors and lambdas.

As you’ve mentioned, a functor is an object that can be called as if it were an ordinary function. This is done by defining an operator() for the object. Functors are often used with STL algorithms that require a function, such as std::sort, std::transform, std::for_each, and many others.

Here is a simple example of a functor that can be used to increment the values in a container:

struct Increment
{
    int operator()(int x) const { return x + 1; }
};

std::vector<int> values = {1, 2, 3, 4, 5};
std::transform(values.begin(), values.end(), values.begin(), Increment());

In this code, the Increment functor is used with the std::transform algorithm to add 1 to each value in the values vector.

Lambdas, introduced in C++11, provide a more convenient and flexible way to create simple functors. A lambda expression can be used to create an unnamed function object right at the location where you use it.

Using a lambda, the above example could be written like this:

std::vector<int> values = {1, 2, 3, 4, 5};
std::transform(values.begin(), values.end(), values.begin(), [](int x) { return x + 1; });

In this code, the lambda function [](int x) { return x + 1; } does exactly the same thing as the Increment functor in the previous example.

Templates in C++ provide a way to write generic code that works with different data types. STL algorithms make extensive use of templates, allowing them to work with any container and any type of data. The combination of templates, functors, and lambdas is incredibly powerful, allowing you to write highly reusable and flexible code in C++.

These features, in a way, make C++ a multi-paradigm language, supporting not just procedural and object-oriented programming, but also generic programming, and to some extent, functional programming. They make C++ a highly expressive language, capable of handling a wide range of programming tasks efficiently and elegantly.

STL also provides additional programming concepts such as parallel programming, via algorithms in the STL. Any algorithm you use, if it is in the STL, you can easily cause to run in a parallel way. For instance, you could execute std::sort with a parallel algorithm.

ChatGPT

Yes, as of C++17, the STL includes support for parallel algorithms. This means that many of the existing algorithms in the <algorithm> library now have overloads that accept an additional first argument, which is an execution policy. This execution policy instructs the algorithm how to execute: sequentially (the default), in parallel, or in parallel with vectorization.

Here’s an example of how you might use a parallel algorithm:

#include <vector>
#include <algorithm>
#include <execution>

std::vector<int> numbers = {5, 4, 3, 1, 2};

std::sort(std::execution::par, numbers.begin(), numbers.end());

In this code, std::execution::par is an execution policy that instructs std::sort to execute in parallel. The C++ runtime will automatically manage the creation and synchronization of threads to perform the sort operation.

The execution policies available are:

std::execution::seq: Specifies that the algorithm should execute sequentially.
std::execution::par: Specifies that the algorithm may be parallelized.
std::execution::par_unseq: Specifies that the algorithm may be parallelized and vectorized.

Remember that not all tasks will see a performance improvement from parallel execution. Sometimes the overhead of creating and managing threads can outweigh the benefits of parallelism, especially for small tasks or data sets.

Additionally, not all STL algorithms support parallel execution, and not all systems or compilers may support the parallel STL. So, while the parallel STL can be a powerful tool for improving performance, it’s important to understand its limitations and to profile your code to ensure you’re actually seeing a performance improvement.

It seems apparent that some aspects of the hardware are going to affect performance of parallel code, and in fact could have a faster single threaded execution, given an algorithm in C++. Generally, there are parallel programming best practices which can be understood. One practice is to understand that computer performance has a lot to do with the cpu’s cache, and its behavior during execution. In some cases, you can write code that considers this behavior to some extent, and potentially, have drastic gains between examples that do not. The more threads that are introduced to the same dataset, the more carefully the cache would need to be understood and managed to maintain highest efficiency.

ChatGPT

Absolutely. Understanding the hardware on which your code runs is critical to achieving the best performance, especially when it comes to parallelism. The CPU cache plays a significant role in this regard. The cache is much faster than main memory (RAM), and how you access data can have a huge impact on performance due to the way caching works.

Here are some things to consider:

1. Temporal locality: If a particular memory location is accessed, it’s likely to be accessed again in the near future. Therefore, once data is fetched into the cache, it’s beneficial to use it as much as possible before it gets replaced.

2. Spatial locality: Memory locations that are close to each other tend to be accessed close together in time. When a memory location is fetched into the cache, nearby data is also fetched. Therefore, accessing memory in a linear fashion is generally more cache-efficient than randomly jumping around memory.

3. False sharing: This is a situation that can occur in multi-threaded programs where threads invalidate each other’s cached data. Even though each thread might be working with a different variable, if those variables are located on the same cache line, then writing to them from different threads can cause the cache line to constantly be invalidated and refreshed, severely hurting performance.

4. Granularity of parallelism: If the work done in each parallel task is too small, then the overhead of starting, synchronizing, and stopping threads can outweigh the benefits of parallelism. On the other hand, if tasks are too large, then load imbalance can become a problem, where some threads finish their work and sit idle while others are still busy.

A good understanding of these concepts can guide you in designing your algorithms and data structures for efficient parallel execution. However, it’s also important to profile and test your code on the actual hardware where it will run, as the specifics can vary greatly between different systems. And remember, the most efficient algorithm is often the simplest one that gets the job done, so don’t prematurely optimize before you have a working solution.

That leads to programming paradigm shift which is somehow considerate of all of these things, and this is a new C++ paradigm.

ChatGPT

Yes, modern C++ has indeed seen a paradigm shift towards more high-level, abstract, and safer programming styles. Some of this is facilitated by new language features and library components introduced in the recent standards, which help in writing code that is more performant, more maintainable, and less prone to common bugs and pitfalls.

For example, the introduction of smart pointers has made it much easier to manage memory safely, reducing the risk of memory leaks and dangling pointers. Lambda expressions and auto type deduction have made code more concise and expressive. The range-based for loop simplifies the process of iterating over containers. Concepts and constraints make templates easier to use and understand. The parallel algorithms in the Standard Library offer a simple and efficient way to parallelize code.

All of these features encourage a style of programming that emphasizes simplicity, safety, and efficiency. They enable developers to focus on the high-level logic of their programs, rather than getting bogged down in low-level details and error-prone manual memory management.

This doesn’t mean that understanding the underlying hardware and low-level programming concepts isn’t important. On the contrary, to get the most out of these high-level features, it’s crucial to have a good understanding of what’s happening under the hood. This includes understanding how memory works, how your code is compiled and executed, and how to profile and optimize your code.

In summary, modern C++ promotes a paradigm where the language and its libraries provide you with powerful tools and abstractions, but also give you the flexibility to drop down to a lower level when necessary. It’s a balance between high-level simplicity and low-level control, making it a very powerful and versatile language for a wide range of programming tasks.

That said, true to this concept, there are many use-cases where considering various levels of manual memory management is beneficial. This capacity is expressed through the C++ concept of views which turned out to be, for the most part, std::span.

ChatGPT

Yes, you’re correct. std::span is a feature introduced in C++20 that represents a lightweight, non-owning reference to a contiguous sequence, or a “view” of data. It can be thought of as a pointer, length pair, and it provides a way to pass around references to memory without copying or transferring ownership.

One advantage of std::span and other view types is that they offer a level of abstraction over raw data while being very lightweight and not incurring the overhead of owning containers. They provide a way to operate on data sequences in a generic way without needing to know the details of the underlying container.

Here’s a simple example:

void print_elements(std::span<int> elements)
{
    for (int e : elements) {
        std::cout << e << '\n';
    }
}

std::vector<int> v = {1, 2, 3, 4, 5};
print_elements(v);  // You can pass a std::vector

int arr[] = {6, 7, 8, 9, 10};
print_elements(arr);  // You can also pass a C-style array

In this example, the print_elements function can accept any contiguous sequence of integers, without needing to know or care whether it’s a std::vector, a C-style array, or some other kind of sequence.

However, it’s important to understand that std::span and similar view types don’t own the data they refer to. They’re just views into other data. This means that you need to be careful to ensure that the underlying data outlives the view. If the data a view refers to is destroyed, then the view is left dangling, much like a dangling pointer.

std::span is just one of several new view types in modern C++. Others include std::string_view and various range views in the Ranges library, which provide powerful and efficient ways to manipulate and view data sequences.

In fact, the implementation of a span is the concept of a pointer, and a length. Pointers are here – but abstracted away into an object. It’s a pointer at heart, but never necessarily sees a new or delete call on it. This concept (raw pointer) is still in C++ at its core, and not all pointers used in a program should be some “std pointer type”. Pointers and References and Iterators are everywhere in C++, and essential to good programming.

ChatGPT

Yes, you’re absolutely right. While modern C++ has introduced abstractions like smart pointers (std::unique_ptr, std::shared_ptr) and container classes that handle memory management, the core concepts of pointers, references, and iterators are still fundamentally important to understanding and effectively using the language.

C++ is a language that allows for both high-level abstractions and low-level direct manipulation of resources. This is a powerful but potentially dangerous combination, as it gives programmers a lot of flexibility but also a lot of rope to hang themselves with, so to speak.

std::span is a good example of this duality. It provides an easy-to-use, high-level interface for working with contiguous sequences of data, but it’s essentially just a wrapper around a raw pointer and a size. As such, it doesn’t manage the memory it points to, and it’s up to the programmer to ensure that the std::span doesn’t outlive the data it refers to.

Iterators are another fundamental concept in C++. They provide a way to access elements in a container, and they abstract away the details of the underlying container. You can think of them as a kind of generalization of pointers. However, like pointers, they don’t own the elements they point to, and they can be invalidated by operations on the container.

In conclusion, while modern C++ provides many tools and abstractions to make memory management safer and easier, understanding the underlying concepts of pointers, references, and iterators is still crucial. It’s this combination of high-level abstractions and low-level control that gives C++ its power and flexibility.

In programming practices, References to objects can be used to make code clearer or faster, and these are pointers which should not see a new or delete called on them, either. However, they are really the same thing. Any code, whatever its source, can be like:

auto& x = value;

In some cases this will speed up code that is often using dereferencing, such as

 x.y.z.e;

multiple times, becomes instead uses of plain e:

auto& e = x.y.z.e;

Furthermore, you can change the name of any variable this way.

 auto& x = someVariable;

Now both of these variable names refer to the same object.

ChatGPT

Absolutely. The use of references in C++ can lead to cleaner, more understandable code, and in some cases, better performance. A reference is essentially a constant pointer that always points to the referred object and cannot be null or reseated to refer to a different object. They are easier to use and less prone to errors than raw pointers, which makes them a preferred choice in many scenarios.

Let’s consider the example you’ve given:

auto& x = someVariable;

In this code, x is a reference to someVariable. Any changes made to x will actually change someVariable, because they both refer to the same object. This is a very powerful concept, especially when passing arguments to functions or returning values from functions.

Consider a function that takes a large object as an argument. If you pass the argument by value, a new copy of the object is created, which can be expensive in terms of memory and performance. However, if you pass the argument by reference, no new copy is created, and the function works directly on the original object:

void modifyObject(Object& obj) {
    // Modify obj directly...
}

In this code, modifyObject takes a reference to an Object. Any changes it makes to obj will be made on the original object that was passed in. This avoids the need to make a copy of the object, which can significantly improve performance when working with large objects.

The use of const with references can further increase safety by preventing the referred object from being modified:

void printObject(const Object& obj) {
    // Print obj, but cannot modify it...
}

In this code, printObject takes a const reference to an Object. This means it can read from obj, but it cannot modify it.

Using references in this way can make your code clearer, safer, and more efficient. However, like all powerful tools, they should be used judiciously and with an understanding of their implications.

One thing const references are useful for is setting a default argument parameter of a function to something sensical.

ChatGPT

Yes, using const references as default parameters in functions can be very useful. It allows you to provide default values for complex types without the need for making a copy, which can be costly in terms of performance for large objects.

Here’s an example:

class MyClass {
public:
    void doSomething(const std::string& str = "default value") {
        // Do something with str...
    }
};

In this example, doSomething has a const std::string& parameter str with a default value of "default value". When this function is called without arguments, it uses this default value.

The use of a const reference here avoids making a copy of the string, which would be the case if the parameter was just std::string. Also, as str is a const reference, the function can’t modify the passed argument, which is a good practice for function parameters that aren’t meant to be modified.

However, one thing to be aware of when using default arguments like this is the lifetime of the default value. In this case, “default value” is a string literal which has static storage duration, meaning it lives for the entire duration of the program. If the default argument was a reference to an object with automatic storage duration (for example, a local variable in the calling function), you could end up with a dangling reference if that object gets destroyed before the function call.

C++ is a language that coexists in a C++ library ecosystem and opensource projects, and indeed writing libraries of your own code for access in other projects is crucial.

ChatGPT

Absolutely! Writing and using libraries is a critical part of software development, and this is especially true in C++. The Standard Library that comes with C++ is a powerful tool, but it’s just the beginning. There are countless other libraries available that can help you do everything from parsing JSON, to rendering 3D graphics, to making HTTP requests, to implementing machine learning models, and much more.

In addition, writing your own libraries can help you organize your code, make it more reusable, and simplify your build processes. Here are some steps to create your own library in C++:

Write the Library Code: This is your header files (.h or .hpp) and source files (.cpp). The header files declare your classes, functions, etc., and the source files define them.
Compile the Library: You compile your library code into either a static library (.lib or .a) or a dynamic library (.dll or .so), depending on your needs. Static libraries are included in the executable at compile time, while dynamic libraries are loaded at runtime.
Use the Library: In the programs where you want to use your library, you #include your library’s header files and link to the library file when you compile the program.
Distribute the Library: If you want others to be able to use your library, you’ll need to distribute the library file and the header files. You may also want to provide documentation, examples, build scripts, etc.

When writing a library, some best practices include:

Use an intuitive namespace and organize your code logically.
Avoid global variables as they can cause conflicts in the programs that use your library.
Make sure to document your code well, so others can understand how to use it.
Use versioning to track changes and maintain compatibility.

Open-source libraries can be a great resource for learning and inspiration. You can learn a lot about design patterns, best practices, and advanced language features by reading and understanding their code. Contributing to open-source libraries can also be a good way to improve your skills and make connections in the developer community.

In conclusion, libraries, whether they’re third-party or your own, are a fundamental part of C++ development, and learning how to use and create them is a valuable skill for any C++ programmer.

Understanding source revisioning and history is important, and tools such as github are crucial to good practices.

ChatGPT

You’re absolutely right. Version control systems, such as Git, and platforms like GitHub, Bitbucket, and GitLab, are essential tools for any modern software development process, including C++ development.

Here are some reasons why understanding and using these tools is so important:

Collaboration: Multiple developers can work on the same codebase without overwriting each other’s changes. They can work on different features or bugs in isolated environments (branches) and then merge their changes back into the main codebase when they’re ready.
Versioning: Every commit serves as a checkpoint that you can return to at any time. This makes it easy to track changes over time, find when a particular change was made, and even roll back changes if necessary.
Backup and Synchronization: Your code is stored remotely on a server like GitHub. This not only serves as a backup, but it also makes it easy to synchronize your work across different machines.
Code Review: Platforms like GitHub provide tools for code review. Other developers can review your changes, suggest improvements, and catch bugs before they make it into the main codebase.
Continuous Integration/Continuous Deployment (CI/CD): These platforms often provide or integrate with tools that can automatically build, test, and deploy your code whenever you make a commit.
Open Source Contribution: If you’re working on open source projects, understanding these tools is essential. Most open source projects use Git for version control and host their code on platforms like GitHub.

If you’re just starting with Git, here are some basic concepts you should understand:

Repository: This is your project’s directory that Git tracks. It includes all your project files and a .git directory that stores your history and settings.
Commit: This is a set of changes that you save (commit) to your repository. Each commit has a unique ID and includes metadata like who made the commit, when it was made, and a message describing the changes.
Branch: This is a parallel version of your code that you can modify without affecting other branches. The main branch is often called master or main.
Merge: This is the process of combining changes from one branch into another.
Pull/Push: These are the operations to get changes from (pull) and send changes to (push) a remote repository.

By learning these tools and concepts, you’ll be well equipped to collaborate on software projects, maintain your code’s history, review and improve code quality, and contribute to open source projects.

C++ is an ongoing technology process which meets new standards and capabilities more and more often. It is something that changes and potentially in dramatic ways. Further study seems necessary and constant, considering the ecosystem of technology is already vast, and ever advancing in different ways. C++ can be used to build any sort of application rapidly. From GUI to graphics, ML to games, VR, anything else, C++ is there somewhere, or could be.

ChatGPT

You’re absolutely correct. C++ is a versatile and powerful language that is constantly evolving to meet the needs of modern software development. It’s used in a wide range of applications, from system software and embedded systems, to video games, web servers, desktop applications, machine learning, and even in the development of other programming languages.

The C++ language itself has been undergoing regular updates, with new versions released every few years (C++11, C++14, C++17, C++20, and so on). These updates introduce new features and enhancements that make the language more powerful, easier to use, and better suited to modern programming paradigms. Some of the recent additions include smart pointers for better memory management, lambda functions for easier functional programming, and improvements to concurrency and parallelism.

To stay current with these changes, continuous learning is indeed necessary. Here are some ways to keep up with the evolving C++ landscape:

Follow the Standards Committee: The ISO C++ committee is responsible for the development of the C++ language. They regularly publish documents detailing proposed changes to the language. Following these publications can give you a glimpse into the future of C++.
Read Books and Online Resources: There are many great books and online resources for learning C++, from beginner guides to in-depth texts on specific aspects of the language. Websites like cppreference.com provide up-to-date references on C++ language features and the standard library.
Online Courses and Tutorials: Websites like Coursera, Udemy, and Pluralsight offer courses on C++ that are updated regularly to incorporate new language features.
Conferences and Meetups: Events like CppCon, Meeting C++, and local C++ meetups are great places to learn about the latest developments in C++, network with other developers, and even contribute to the language’s development.
Practice: The best way to learn is by doing. Regularly writing C++ code, experimenting with new features, and working on projects will help you understand the practical implications of changes to the language.
Join Online Communities: Online forums like Stack Overflow, the C++ subreddit, and C++ forums are places where you can ask questions, share your knowledge, and learn from other developers’ experiences.

In conclusion, while C++ is a mature language, it continues to evolve and adapt to the changing technology landscape. By continuously learning and adapting, you can leverage the power of C++ to build a wide range of applications.

It is necessary to remember that this evolution can at any point be dramatic. In ten years time, the language could appear different and have different, more advanced, capabilities and concept implementations, such that it could be basically unrecognizable. Updating C++ learning and teaching practices must also frequently evolve or be rendered depreciated or plain wrong.

ChatGPT

Absolutely, the pace of technological advancement means that languages like C++ are continuously evolving, sometimes in ways that significantly change how they are used. New features, paradigms, and best practices are introduced that can make old ways of doing things obsolete or even incorrect.

As you rightly pointed out, this necessitates a commitment to lifelong learning and adaptability from developers. Regularly refreshing one’s knowledge, keeping up-to-date with the latest standards and practices, and continually challenging and updating one’s understanding are all key to staying relevant in the field of software development.

For educators, this poses the additional challenge of ensuring that their teaching materials and methods are up-to-date. This includes not just the content (e.g., the features of the language being taught) but also the methodologies and tools used for teaching (e.g., cloud-based IDEs, collaborative coding tools, etc.).

Education and industry need to work together to ensure that the skills being taught are relevant to the current and future needs of the industry. Regularly updating curricula, involving industry professionals in education, and promoting a culture of continuous learning can all help in achieving this.

It’s an exciting time to be in the field of software development. The rapid pace of change can be challenging, but it also opens up new opportunities for learning and growth. Embracing this change and the opportunities it brings is key to a successful career in software development.

Lets talk about writing ML in C++.

ChatGPT

Machine Learning (ML) in C++ can be a bit more challenging compared to languages specifically designed for data science tasks, like Python or R. However, there are several libraries available in C++ for machine learning, and C++ has the advantage of performance and control over low-level aspects of the computation, which can be important in some ML applications.

Here are some popular C++ libraries for machine learning:

MLPack: MLPack is a scalable machine learning library, written in C++, that aims to provide fast, extensible implementations of cutting-edge machine learning algorithms. MLPack provides these algorithms as simple command-line programs and C++ classes which can then be integrated into larger-scale machine learning solutions.
Dlib: Dlib is a modern C++ toolkit containing machine learning algorithms and tools for creating complex software in C++ to solve real world problems. It is used in both industry and academia in a wide range of domains including robotics, embedded devices, mobile phones, and large high performance computing environments.
Shark: Shark is a fast, modular, feature-rich open-source C++ machine learning library. It provides methods for linear and nonlinear optimization, kernel-based learning algorithms, neural networks, and various other machine learning techniques.
TinyDNN: TinyDNN is a C++14 implementation of deep learning. It is suitable for deep learning on limited computational resource, embedded systems, and IoT devices.

When working with machine learning in C++, remember that data manipulation and analysis might not be as straightforward as in Python. You may need additional libraries for tasks like reading CSV files, manipulating data frames, and visualizing data. Libraries such as Eigen for linear algebra, Boost for data structures and processing, and Gnuplot++ for plotting can be helpful.

Even though Python dominates the ML field, there are certain cases where using C++ for ML makes sense:

Performance-sensitive applications: C++ is typically much faster than Python, which can make a big difference when processing large datasets or when the ML application needs to be highly responsive, such as in real-time trading systems.
Existing C++ codebases: If you are working in a codebase that is already in C++, it might make sense to use C++ for ML as well, rather than introducing a new language into the mix.
Embedded systems: C++ is commonly used for programming embedded systems, which often have strict constraints on memory and computational resources.

Remember, the choice of programming language often depends on the problem you’re trying to solve, the requirements of the system you’re working on, and the tools you’re most comfortable with.

Specifically, to how can I implement ML from scratch in a basic main example, in modern C++.

ChatGPT

Implementing machine learning algorithms from scratch is a great way to learn the underlying concepts and mechanics. Let’s create a simple linear regression model from scratch using modern C++. Note that we’re not going to use any ML libraries here, the focus is on understanding the principles.

A linear regression model tries to fit a line to data points that minimizes the sum of the squares of the distances from each data point to the line. The line is defined by the equation y = mx + b, where m is the slope and b is the y-intercept. We’ll use gradient descent to iteratively adjust m and b to minimize the cost function (mean squared error).

Here is a simple example:

#include <iostream>
#include <vector>
#include <cmath>

// Function to compute the mean of a vector
double mean(const std::vector<double>& v) {
    double sum = 0.0;
    for (double x : v) {
        sum += x;
    }
    return sum / v.size();
}

// Function to compute error
double compute_error(const std::vector<double>& x,
                     const std::vector<double>& y,
                     double m, double b) {
    double totalError = 0.0;
    for (size_t i = 0; i < x.size(); ++i) {
        totalError += std::pow(y[i] - (m * x[i] + b), 2);
    }
    return totalError / x.size();
}

// Function to perform gradient descent
void gradient_descent(std::vector<double>& x,
                      std::vector<double>& y,
                      double& m, double& b,
                      double learning_rate, int num_iterations) {
    size_t N = x.size();
    for (int i = 0; i < num_iterations; ++i) {
        double grad_m = 0.0, grad_b = 0.0;
        for (size_t j = 0; j < N; ++j) {
            grad_m += -2/N * x[j] * (y[j] - (m * x[j] + b));
            grad_b += -2/N * (y[j] - (m * x[j] + b));
        }
        m -= learning_rate * grad_m;
        b -= learning_rate * grad_b;

        // Print error every 10 iterations
        if (i % 10 == 0) {
            std::cout << "Error at iteration " << i
                      << " = " << compute_error(x, y, m, b) << std::endl;
        }
    }
}

int main() {
    // Input data
    std::vector<double> x = {1, 2, 3, 4, 5};
    std::vector<double> y = {5, 7, 9, 11, 13};

    // Initial parameters
    double m = 0.0, b = 0.0;

    // Learning rate and number of iterations
    double learning_rate = 0.01;
    int num_iterations = 100;

    // Perform gradient descent
    gradient_descent(x, y, m, b, learning_rate, num_iterations);

    // Print final parameters
    std::cout << "m = " << m << ", b = " << b << std::endl;

    return 0;
}

This is a very simple example of a linear regression model in C++. For more complex models, such as neural networks, the mathematics and code would be much more complex, but the basic principles remain the same. Also, note that this code doesn’t handle any exceptions or edge cases, and doesn’t include any validation

If this conversation had been meant to be meaningful to readers, how should it end?

ChatGPT

This conversation can end by summarizing the main points discussed and providing a roadmap for further study.

We started by discussing the basics of C++, looking at its syntax and the core concepts that make up the language, including variables, control flow, functions, and object-oriented programming. We then delved into more advanced topics like understanding assembly language in the context of C++, exploring the concept of short-circuiting and how the compiler can reorder code execution.

We also discussed the importance of the Standard Template Library (STL) in C++, which provides several generic classes and functions to handle common programming tasks. We touched upon how modern C++ has evolved and how it encompasses more advanced programming paradigms like template programming, parallel programming, and views.

Further, we noted the importance of understanding and effectively managing memory in C++, and how practices like using cost references, pointers, and iterators can improve both performance and code clarity.

We emphasized the importance of staying up-to-date with the changing landscape of C++, given that it is a living language that continues to evolve and grow. We also talked about the significance of C++ in the open-source ecosystem and the need for source control and versioning in development practices.

Towards the end, we explored the use of C++ in machine learning and how various libraries can be utilized for this purpose. We also provided an example of how to implement a basic linear regression model in C++ from scratch.

As a next step, consider diving deeper into each of these topics, practicing by writing more code, and working on small projects to apply what you’ve learned. Exploring the many open-source C++ projects available can provide insight into how professional C++ developers write code. Continuous learning and practice are the keys to mastering C++ or any programming language.

Remember, programming is not just about writing code. It’s about problem-solving, logical thinking, and continuously learning and adapting. Happy coding!

Your C++ console style is depreciated, a better way to use cout in your code is as follows:

std::cout << std::format( "Error at iteration {} = {}\n", i, compute_error(x, y, m, b) );

ChatGPT

Yes, you’re correct. The std::format function was introduced in C++20 and it provides a more flexible and readable way to format strings, similar to Python’s f-strings or .NET’s string interpolation.

The std::format function works by replacing placeholders in the format string with the corresponding argument values. It supports a wide variety of formatting options, making it a powerful tool for string manipulation.

Here’s how you can use std::format in the gradient descent function from the previous example:

#include <format>

// ...

// Function to perform gradient descent
void gradient_descent(std::vector<double>& x,
                      std::vector<double>& y,
                      double& m, double& b,
                      double learning_rate, int num_iterations) {
    size_t N = x.size();
    for (int i = 0; i < num_iterations; ++i) {
        double grad_m = 0.0, grad_b = 0.0;
        for (size_t j = 0; j < N; ++j) {
            grad_m += -2/N * x[j] * (y[j] - (m * x[j] + b));
            grad_b += -2/N * (y[j] - (m * x[j] + b));
        }
        m -= learning_rate * grad_m;
        b -= learning_rate * grad_b;

        // Print error every 10 iterations
        if (i % 10 == 0) {
            std::cout << std::format("Error at iteration {} = {}\n", i, compute_error(x, y, m, b));
        }
    }
}

Note that std::format is not yet widely supported by all compilers as of my knowledge cutoff in September 2021. Make sure you’re using a compiler that supports C++20 if you wish to use this feature.

Format can also be useful in replacement of string concatenation, stringstream operations, or string conversion.

ChatGPT

Absolutely, std::format can replace many typical string operations and make your code cleaner and more readable. It provides a type-safe and flexible way to format and concatenate strings.

Here are some examples:

String Concatenation:

Old way:

std::string name = "Alice";
std::string greeting = "Hello, " + name + "!";

With std::format:

std::string name = "Alice";
std::string greeting = std::format("Hello, {}!", name);

Stringstream Operations:

Old way:

std::stringstream ss;
ss << "The sum of " << a << " and " << b << " is " << (a + b) << ".";
std::string result = ss.str();

With std::format:

std::string result = std::format("The sum of {} and {} is {}.", a, b, (a + b));

String Conversion:

Old way:

int value = 123;
std::string str_value = std::to_string(value);

With std::format:

int value = 123;
std::string str_value = std::format("{}", value);

As you can see, std::format makes these operations simpler and more straightforward. It also eliminates the need to remember different functions for different types, as std::format can handle all types that have an operator<< defined for std::ostream.

Again, std::format is part of C++20, so you’ll need a modern compiler that supports C++20 to use it.

We should rewrite the typical C++ “Hello World” program using std::format and spread the word.

ChatGPT

Absolutely, here is the “Hello, World!” program written using std::format:

#include <iostream>
#include <format>

int main() {
    std::string world = "World";
    std::cout << std::format("Hello, {}!\n", world);
    return 0;
}

In this example, the placeholder {} in the format string is replaced by the string world.

This indeed makes the code more clear and maintainable, especially when the string to be printed becomes more complex. As std::format is part of C++20, spreading its usage would indeed be a great way to promote modern C++ features!

how about something more like:

std::cout << std::format("Hello {} World!", "C++");

ChatGPT

Absolutely, that works great as well! Here is the modified “Hello, World!” program:

#include <iostream>
#include <format>

int main() {
    std::cout << std::format("Hello {} World!\n", "C++");
    return 0;
}

In this example, the placeholder {} in the format string is replaced by the string “C++”. This prints “Hello C++ World!” to the console.

std::format makes it easy to insert variables or literals into strings, and helps to make your code clearer and more maintainable. It’s indeed a great feature to showcase and promote!

Your code has an error in it, N and -2 should be of type double in order for “-2 / N *” to not undergo integer division

ChatGPT

You are correct, I apologize for the mistake. The division of -2 / N in the code will result in integer division because both numbers are integers. In integer division, the decimal part of the result is discarded, so -2 / N would result in 0 for N > 2, which would indeed cause an error in the gradient calculation.

The fix is to make at least one of the numbers a floating-point number so that the division is performed in floating-point arithmetic, which keeps the decimal part of the result. The corrected line should be:

auto y2 = -2.0 / N * (y[j] - y_est);

Now -2.0 is a double, so the division will be performed in floating-point arithmetic and y2 will be calculated correctly. Thank you for bringing this to my attention.

After this we talked about modernizing the code for a while, which chat GPT is not the best at yet.

How about the following revision:

#include <iostream>
#include <vector>
#include <format>
#include <algorithm>
#include <execution>
#include <span>
#include <array>

using GradientDescentPair = std::pair<double, double>;
// Function to perform gradient descent
GradientDescentPair gradient_descent(std::span< const double> x, std::span< const double> y
    , const double learning_rate = 0.01, const int num_iterations = 100) {

    if (x.size() != y.size()) {
        throw std::invalid_argument(std::format("Error: x and y must have the same size, but they are {} and {}.", x.size(), y.size()));
    }

    const auto N = x.size();
    double m = 0, b = 0;
    std::vector<double> grad_m(N, 0), grad_b(N, 0);

    for (auto i = 0; i < num_iterations; ++i) {

        //perform gradient calculations in parallel
        std::for_each(std::execution::par, x.begin(), x.end(),
            [&](auto& xi) {

                std::size_t j = &xi - x.data();

                auto y_est = m * xi + b;
                auto y2 = -2.0 / N * (y[j] - y_est);

                grad_m[j] = y2 * xi;
                grad_b[j] = y2;
            });

        //perform accumulation in parallel
        auto parAccumulate = [&]() {
            std::array<double, 2> gradients = { 0.0, 0.0 };
            std::array<std::span<double>, 2> spans{ grad_m, grad_b };

            std::for_each(std::execution::par, spans.begin(), spans.end(), [&](auto& span) {

                std::size_t i = &span - spans.data();

                gradients[i] = std::accumulate(span.begin(), span.end(), 0.0);

                });
            return gradients;
        };
        auto [g_m_sum, g_b_sum] = parAccumulate();

        m -= learning_rate * g_m_sum;
        b -= learning_rate * g_b_sum;

        if (i % 10 == 0) {

            auto compute_error = [&]() {
                double totalError = 0.0;
                for (size_t i = 0; i < N; ++i) {
                    totalError += std::pow(y[i] - (m * x[i] + b), 2);
                }
                return totalError / N;
            };

            std::cout << std::format("Error at iteration {} = {}\n", i, compute_error());
        }
    }
    return { m, b };
}

int main() {

    std::vector<double> x = { 1, 2, 3, 4, 5 }
                      , y = { 5, 7, 9, 11, 13 };

    auto [m, b] = gradient_descent(x, y);

    std::cout << std::format("m = {}, b = {}\n", m, b);

    return 0;
}

How std::span changes C++ fundamentally

In C++, we have all sorts of std containers, such as vector. Normally, when we work with vectors, we also pass them around by reference, and vectors of course own their data in a contiguous way.

When a vector is passed to a function, we pass it by reference:

std::vector<int> ints;

auto foo =[&](std::vector<int>& v){
 std::iota( v.begin(), v.end(), 0);
};

foo(ints);

In C++20, there is a new object called: std::span. span is a non-owning object that interfaces like a typical vector. Before standardisation, span was considered a “view”, and that’s what it is.

To start, you can use a span when passing a vector to a function, and perhaps normally should.

std::vector<int> ints(100,0);

auto foo =[&](std::span<int> v){
 std::iota( v.begin(), v.end(), 0);
};

foo(ints);

The span allows us to do really cool stuff with memory management and access, for instance:

std::vector<int> ints(100,0);

std::span<int> a( ints.begin(), 50 )
 , b( ints.begin()+50, ints.end() );

auto foo =[&](std::span<int> v){
 std::iota( v.begin(), v.end(), 0);
};

foo(a);
foo(b);

In this example, a refers to the first half of ints, and b refers to the second half.

We now have two vector like objects that infact refer to a single contiguous segment of memory, cache performance goes up. Spans can be used exactly like vectors, except, they are non-owning and just “views” into owning vectors.

Software development and sneaky algorithms

So I have been engineering my own neural network technology for some time, and I’ve been running tests and experiments sometimes. What I have discovered about backrpopagation, is that, when there are errors in backpropagation algorithm, they may not break backpropagation completely. It can work in partial completeness or with some amount of errors, to some extent.

The first attempt I made to write backpropagation some years ago, I used a math library called Armadillo in hopes that it would be fast. This code ended up sort of working, but clearly, was not actually working, and indeed there were so many errors since I wrote it during a 24 hour challenges.

Then, I branched and started from scratch without armadillo and just c++. This code worked too, but, it turns out there were bugs which in hindsight, introduced limits.

In the next branch, I revisted Armadillo using my understanding of my c++ code and more Armadillo research, to produce yet another version of backpropagation which seemed to work at least as good as my c++ version.

Finally, I created a branch to compare these two working methods to eachother and see what errors I can find. I found a lot of similarities between the two branches, and also, clear errors in both branches. I corrected them as far as I can tell, and found out my c++ code is some 10x faster than the Armadillo code, and I wonder if I am using Armadillo properly, IDK.

In my current project, in a pool of 2000 networks, if the backpropagation math is correct, then I generate maybe 30 xor networks with fewer than 100 trainings of specific xor function.

If I train circle function beforehand, then I end up training more like 200 xor networks with fewer than 100 trainings of specific xor function.

Some of the technology I developed related to genetic algorithms all functioned properly even though backpropagation was broke. I produced mnist network which could recognize 0,1,2 digits – with broken backpropagation process on CPU, probably mainly functioning through genetic features. It seemed to approach limits which I hoped were errors that I could fix, and perhaps I have. Errors or no errors, there is still the project.

The smallest xor network and lottery ticket

I am creating networks that are: two input nodes, x*y hidden nodes, and one output node. The ability to create these networks has to do with some odds which are based on the initial state of the starting network pool. I have a pool of 2000 networks which are randomly generated, and I try to train them to learn xor in real time. I train few or zero xor networks in real time. After this, I train on some other function for some period of real time. After returning to xor, xor trains in real time. At first, xor is learned by some small number, say one to five percent of networks out of 2000. After this, I return to some other problem, a math problem, a geometry problem, some arbitrary function, and then return to xor, and then, there are more trained xor networks trained in real time. This process continues and more and more xor networks are trained. This number of xor networks is far greater then the number of networks found only by training xor alone, which could be zero.

Real time, in this case, I create 3×3 xor networks in a pool of 2000, in fewer than 10k training of specific xor function, and they continue to be created at a rate greater than just xor training.

This leads to a process I call network formatting, which is the concept of starting out with an existant network data to train some arbirary other type of network. I want to create small xor networks which have had their solution space expanded dramatically beyond xor. Then, in some way, use these existant networks to do other things instead, that is the idea.

So one thing I have learned is that there there is a whole dimension of output in just one class of output. All numbers we can represent in that space on computers, could be entirely different distinct outputs. One thing I do with these networks are math functions, with two input nodes, x*y hidden, and one output, I can create the following: f(x, y) = z. Using this format I can perform math, memorization, constant passing and manipulating, geometry and line functions, all using these small networks intended for xor or something else eventually.

Another thing I’ve learned is that a remarkable amount of networks are convergent on many problems starting from their completely random state. As the difficulty of the initial problem increases, the number found goes down. That is, I can process a signal such that, on many problems, completely randomly generated networks, have convergence indicated through this signal.

Another thing I’ve found is that I had been using smaller radius to measure a networks “convergence”. It turns out, a network with a larger radius can beat a smaller radius network, potentially, and I’m calling this measurement the super radius.

C++ comma operator and parallelization

I have never had a reason to use the comma operator, however, writing some modern code, it seems required.

Say you have some series of variables and you want to perform a common operation on the group.

std::vector<std::size_t> a, b, c;

auto generateSortingIndexes = [&](auto&...args) {

     auto gen = [&](auto& arg){

          arg.resize(poolSize);
          std::iota(arg.begin(), arg.end(), 0);
     };
            
     (gen(args), ...);
};
generateSortingIndexes(a,b,c);

New C++ is always a fun thing, I know.

We could change this to be a generic function that could be a good addition to any C++ library:

std::vector<std::size_t> a, b, c;

auto generate = [&](auto gen, auto&...args) {
   (gen(args), ...);
};

generate([&](auto& arg) {

   arg.resize(poolSize);
   std::iota(arg.begin(), arg.end(), 0);

   }, a, b, c);

In this case, maybe the best code would actually be as follows:

std::vector<std::size_t> a, b, c;

a.resize(poolSize);
std::iota(a.begin(), a.end(), 0);

b = c = a;

Though this is only because they’re all identical.

The real problem or benefit of this code is that it is somewhat a compile time expansion and also not multi-threaded at runtime. In reality, I would probably want all of that initialization to occur in parallel rather than in sequence.

I could go parallelizing this code in two easy ways:

with the standard library, I keep with the trend of passing naked variables, so the first step is to wrap them and group them. After that is the parallel part:

std::vector<std::reference_wrapper<std::vector<std::size_t>>> vectors = {  
      a
      ,b
      ,c };

std::for_each(std::execution::par, vectors.begin(), vectors.end(), [&](auto& ref) {
           
      auto& vector = ref.get();

      vector.resize(100);
      std::iota(vector.begin(), vector.end(), 0);
 });

Another way is with openmp, which looks like as follows:

#pragma omp parallel
{
 for (int i = omp_get_thread_num(); i < vectors.size(); i += omp_get_num_threads()) {

     auto& vector = vectors[i].get();

     vector.resize(100);
     std::iota(vector.begin(), vector.end(), 0);
 }
}

What a fascinating loop paradigm that openmp produces, the loop is executed in parallel stripes.

A simpler way would be

std::vector<std::reference_wrapper<std::vector<std::size_t>>> vectors = {  
        b
      , c };

#pragma omp parallel
{
 for (int i = omp_get_thread_num(); i < vectors.size(); i += omp_get_num_threads()) {

     auto& vector = vectors[i].get();

     vector = a;
 }
}

Errors and Progress in mnist recognition

So, in my earlier post, I said I had a “convergent mnist network”. At the time I was excited and I wrote that in haste. What that network had been doing, it had been trained on null and digits, but only one image for each of these digit was actually ever trained into the network. This network could recognize more than ten images, around 60, out of a total test dataset of 1000.

Since then, I have been trying to train on as much and types of data as possible, and I would not say that I have a convergent mnist network, yet.

What I have produced and learned is about tension inside the network. I can see why there is a ml library called “Tensorflow”, though I have not studied it, yet.

For each class of data the network recognizes or thinks about, there is a tension which spreads across the network. So, in my network of 11 classes, there are eleven of these tensors which I can measure. As the network is trained in some way, these tensors change in relation to each other. Some of the tensors have a sharp pull and are highly convergent, accurate to 100% of test data. Other tensors, may not be convergent at all, and are a source of confusion and change in the network’s space. Many tensors can be highly convergent while others are not convergent at all.

The way I am sort of viewing the network tensors right now, is that each highly convergent tensor is a sort of “black hole” in the network’s space. The relationships in the space between all of these objects affects the network’s capacity to learn and I think could even prevent learning.

One thing I am experimenting with is layer width and layer depth. Mainly I am engineering algorithm to make convergence more likely for any given network. I am working on the theory that great engineering skill can compensate for lesser science and mathematical skills. I hope this turns out true in the future. And I do think there is cutting edge math and science being done that I can’t really expect myself to follow very effectively or find access to.

I am still not on GPU because I feel there is much more progress to be made in cpu/c++ as far as technology is concerned. However, my test times on CPU make me think training anything larger than mnist will require bigger compute ability.

I have imaged a network which is highly convergent though not on all digits, and the image seems to be dramatically orders of magnitude more complex than the images seen before. I have no video yet though there could be one soon.

Non-Convolutional Image Recognition

I have had some difficulty determining numbers for training times for mnist, so I am going to post some of mine, and also discuss what my network is doing.

So far in my work on mnist, I have generated a convergent network. Using cpu only, ryzen 7 1700, I train a convergent network in under five minutes.

I have made a video of a small convergent network which was worked on for few hours, and here it is:

You may need to go to youtube to view it full screen, which I recommend.

This network consists of 10 layers, two of which, are not shown, these are the input and output layers.

The input layer is the size of the mnist image, 784, and would have a pixel dimension of 7840, this layer is excluded.

The output layer is the number of classifications, it has 11, one for each digit and also null. This layer is also excluded.

The remaining hidden layers are all ten nodes in length and therefore 100 pixels in dimension.

The total number of nodes in the network is 875, with the vast majority of these nodes on the input layer alone.

The network does no convolution, all layers are fully connected layers. I have been studying convolution for some time and am unsure how necessary it is for image recognition. I read things on the internet which suggest that there is no relationship between nodes, and therefore relationships must be created through convolution. This is not true, there are indeed spacial relationships, and this is evident in my videos. I have not implemented convolution in C++ yet, nor python, for comparisons of non-convolution vs convolution image recognition. Furthermore, I do not have much experience with python and am only starting to study things like tensorflow.

From what I think initially, is that convolution revists many pixels, and I wonder, does this result in more processing speed or recognition power than non-convolution, I do not know, yet.

Code Simplicity of binary files and C++ wonder

There is a lot of code online about reading MNIST dataset, and I have produced my own version, which uses my Binary/Reader classes talked about in a previous post. So, you could be thinking, Binary Image file stuffs with C++, oh, no! I have seen some of the parsers easily obtainable online, written in C++, and I can imagine what poor code looks like. I found none of the examples to be particularly expressive C++, and have since generated my own code which uses C++ up to 2023 .

In this post I’m going to describe loading the MNIST binary file, parsing it and specifically highlight my custom binary file class. It should seem magical how simple working with a binary file in C++ actually is, and indeed, it is somewhat magical.

The MNIST dataset includes two files, an images file, and a labels file. Both files have a binary heading and then binary data segment.

The format of the binary heading is as follows and all fields are uint32_t:

image file Magic number = 2051;
number of images = 10000
image height = 28
image width = 28

The data segment of the image file contains grey pixels of 28×28 images, the pixel type is uint8_t:

image pixels dimension = [28*28];

The number of images is 10,000.

The labels file is a list of 10,000 labels, uint8_t, and has a heading consisting of a Magic number and the number of labels.

label file Magic number = 2049
number of labels = 10000
uint8_t label[10000];

I read these files simultaneously.

The values in headings need to have their byte order reversed, but for actual data, this is not the case.

Here is my code to read MNIST followed by more code.

void parseMNIST() {

	std::string imagesFileName = "mnist//t10k-images.idx3-ubyte"
		, labelsFileName = "mnist//t10k-labels.idx1-ubyte";

	auto readHeading = [](auto& file, auto& h) {
		file >> h; h = std::byteswap(h);
	};

	auto checkMagic = [&](auto& file, uint32_t magic) {

		uint32_t input{ 0 }; readHeading(file, input);

		if (input != magic) {
			throw std::logic_error("Incorrect file magic");
		}
	};

	BinaryReader imagesIn(imagesFileName)
		, labelsIn(labelsFileName);

	uint32_t numImages{ 0 }
		, numLabels{ 0 }
		, rows{ 0 }
		, cols{ 0 };

	checkMagic(imagesIn, 2051);
	checkMagic(labelsIn, 2049);

	readHeading(imagesIn, numImages);
	readHeading(labelsIn, numLabels);

	if (numImages != numLabels) {
		throw std::logic_error("image num should equal label num");
	}

	readHeading(imagesIn, rows);
	readHeading(imagesIn, cols);

	std::size_t imageSize = rows * cols;

	uint8_t label{ 0 };
	std::vector<uint8_t> image(imageSize);

	for (std::size_t i = 0; i < numImages; ++i) {

		imagesIn >> image;
		labelsIn >> label;

		mData[label].push_back(image);
	}
}

To draw an image in the console I do the following:

void drawDigit(uint8_t digit, std::size_t index=0) {

    auto& image = mData[digit][index];
    
    using namespace std;
    cout << to_string(digit) << endl;

    for (size_t y = 0; y < 28; ++y) {
        for (size_t x = 0; x < 28; ++x) {
            cout << to_string(image[y * 28 + x]) << ",";
        }
        cout << endl;
    }

    cout << endl;
}

I separated the MNIST data into separate digit files, here’s what creating and reading them ended up looking like:

void loadDigit(uint8_t digit) {
		
	BinaryReader imagesIn("mnist_" + std::to_string(digit));

	std::vector<uint8_t> image(28*28);
	while (imagesIn >> image) {
		
		mData[digit].push_back(image);
	};
}
void makeDigitFiles() {

	for (auto& [label, images] : mData) {

		std::string fileName = "mnist_" + std::to_string(label);
		BinaryWriter imagesOut(fileName);

		for (auto& image : images) {
			imagesOut << image;
		}
	}
}