멀티쓰레딩 백터내적 예시 코드(홍정모의 TBCPP)

이도현·2023년 11월 8일
0

해설
https://jiwanm.github.io/cpp/chapter19-6/

코드

#include <iostream>
#include <chrono>
#include <mutex>	
#include <utility>
#include <vector>	
#include <atomic>
#include <numeric>		// std::inner_product
#include <random>
#include <execution>	// parallel execution
#include <future>
#include <thread>

using namespace std;

mutex mtx;

void dotProductNaive(const vector<int>& v0, const vector<int>& v1,
	const unsigned int start, const unsigned int end, unsigned long long& sum)
{
	for (unsigned int i = start; i < end; ++i)
	{
		sum += (v0[i] * v1[i]);
	}
}

void dotProductLock(const vector<int>& v0, const vector<int>& v1,
	const unsigned int start, const unsigned int end, unsigned long long& sum)
{
	//cout << "Thread Start" << endl;
	for (unsigned int i = start; i < end; ++i)
	{
		std::scoped_lock lock(mtx);
		sum += (v0[i] * v1[i]);
	}
	//cout << "Thread End" << endl;
}

void dotProductAtomic(const vector<int>& v0, const vector<int>& v1,
	const unsigned int start, const unsigned int end, atomic<unsigned long long> & sum)
{
	for (unsigned int i = start; i < end; ++i)
	{
		sum += (v0[i] * v1[i]);
	}
}

auto dotProductFuture(const vector<int>& v0, const vector<int>& v1,
	const unsigned int start, const unsigned int end)
{
	int sum = 0;
	for (unsigned int i = start; i < end; ++i)
		sum += (v0[i] * v1[i]);
	return sum;
}


int main()
{
	const long long numData = 100'000'000;
	const unsigned int numThread = 4;
	// thread 개수 늘린다고 degree of multithreading이 높아지지는 않음

	vector<int> vec0, vec1;
	vec0.reserve(numData);
	vec1.reserve(numData);

	std::random_device seed;
	std::mt19937_64 makerand(seed());
	std::uniform_int_distribution<> range(1, 10);

	for (long long i = 0; i < numData; ++i)
	{
		vec0.push_back(range(makerand));
		vec1.push_back(range(makerand));
	}

	cout << "실험 1번 - std::inner_product\n";
	{
		const auto sta = std::chrono::steady_clock::now();		// 시간 측정 시작
		const auto sum = std::inner_product(vec0.begin(), vec0.end(), vec1.begin(), 0ull);
		// 두 벡터의 개수가 같다고 가정하므로 vec1은 begin만 있어도 되며 ull은 unsigned long long의 약자
		const chrono::duration<double> dur = chrono::steady_clock::now() - sta;		

		cout << "연산에 소요된 시간 = " << dur.count() << '\n';
		cout << "결과값 = " << sum << '\n';
		cout << '\n';
	}

	cout << "실험 2번 - Naive\n";
	{
		const auto sta = std::chrono::steady_clock::now();
		
		unsigned long long sum = 0;

		vector<std::thread> threads;
		threads.resize(numThread);

		const unsigned int numPerThread = numData / numThread;
		for (unsigned int t = 0; t < numThread; ++t)
			threads[t] = std::thread(dotProductNaive, std::ref(vec0), std::ref(vec1),
				t * numPerThread, (t + 1) * numPerThread, std::ref(sum));
		for (unsigned int t = 0; t < numThread; ++t)
			threads[t].join();

		const chrono::duration<double> dur = chrono::steady_clock::now() - sta;

		cout << "연산에 소요된 시간 = " << dur.count() << '\n';
		cout << "결과값 = " << sum << '\n';
		cout << '\n';
	}

	cout << "실험 3번 - Lockguard\n";
	{
		const auto sta = std::chrono::steady_clock::now();
		
		unsigned long long sum = 0;

		vector<std::thread> threads;
		threads.resize(numThread);

		const unsigned int numPerThread = numData / numThread;
		for (unsigned int t = 0; t < numThread; ++t)
			threads[t] = std::thread(dotProductLock, std::ref(vec0), std::ref(vec1),
				t * numPerThread, (t + 1) * numPerThread, std::ref(sum));
		for (unsigned int t = 0; t < numThread; ++t)
			threads[t].join();

		const chrono::duration<double> dur = chrono::steady_clock::now() - sta;

		cout << "연산에 소요된 시간 = " << dur.count() << '\n';
		cout << "결과값 = " << sum << '\n';
		cout << '\n';
	}

	cout << "실험 4번 - Atomic\n";
	{
		const auto sta = std::chrono::steady_clock::now();
		
		atomic<unsigned long long> sum = 0;

		vector<std::thread> threads;
		threads.resize(numThread);

		const unsigned int numPerThread = numData / numThread;
		for (unsigned int t = 0; t < numThread; ++t)
			threads[t] = std::thread(dotProductAtomic, std::ref(vec0), std::ref(vec1),
				t * numPerThread, (t + 1) * numPerThread, std::ref(sum));
		for (unsigned int t = 0; t < numThread; ++t)
			threads[t].join();

		const chrono::duration<double> dur = chrono::steady_clock::now() - sta;

		cout << "연산에 소요된 시간 = " << dur.count() << '\n';
		cout << "결과값 = " << sum << '\n';
		cout << '\n';
	}

	cout << "실험 5번 - Future\n";
	{
		const auto sta = std::chrono::steady_clock::now();

		unsigned long long sum = 0;

		vector<std::future<int>> futures;
		futures.resize(numThread);

		const unsigned int numPerThread = numData / numThread;
		for (unsigned int t = 0; t < numThread; ++t)
			futures[t] = std::async(dotProductFuture, std::ref(vec0), std::ref(vec1),
				t * numPerThread, (t + 1) * numPerThread);
		for (unsigned int t = 0; t < numThread; ++t)
			sum += futures[t].get();
		
		const chrono::duration<double> dur = chrono::steady_clock::now() - sta;

		cout << "연산에 소요된 시간 = " << dur.count() << '\n';
		cout << "결과값 = " << sum << '\n';
		cout << '\n';
	}
	// TODO : Future의 divide and conquer 방식은 thread에서도 구현해보자.
	// async 대신에 thread와 promise를 사용해서 future을 사용해보자.
	
  cout << "실험 6번 - promise\n";
	{
		const auto sta = std::chrono::steady_clock::now();
		
		vector<std::promise<int>> prom;
		vector<std::future<int>> future;
		vector<std::thread> threads;
		prom.resize(numThread);
		future.resize(numThread);
		threads.resize(numThread);
		
		unsigned long long sum(0);
		unsigned numPerThread = numData / numThread;
		for (unsigned i = 0; i < numThread; ++i)
		{
			future[i] = prom[i].get_future();
			unsigned long long tempSum(0);
			threads[i] = std::thread([&](std::promise<int>&& prom)
				{
					std::scoped_lock lock2(mtx);
					
					for (unsigned int j = 0; j < numPerThread; ++j)
						tempSum += (vec0[j] * vec1[j]);
					prom.set_value(tempSum);
				}, std::move(prom[i]));
			threads[i].join();
		}
		for (unsigned int t = 0; t < numThread; ++t)
			sum += future[t].get();

		const chrono::duration<double> dur = chrono::steady_clock::now() - sta;

		cout << "연산에 소요된 시간 = " << dur.count() << '\n';
		cout << "결과값 = " << sum << '\n';
		cout << '\n';
	}

	cout << "실험 7번 - std::transform_reduce\n";
	{
		const auto sta = std::chrono::steady_clock::now();
		const auto sum = std::transform_reduce(std::execution::par, vec0.begin(), vec0.end(), vec1.begin(), 0ull);
		const chrono::duration<double> dur = chrono::steady_clock::now() - sta;

		cout << "연산에 소요된 시간 = " << dur.count() << '\n';
		cout << "결과값 = " << sum << '\n';
		cout << '\n';
	}
}

출력

실험 1번 - std::inner_product
연산에 소요된 시간 = 0.068767
결과값 = 3025113384

실험 2번 - Naive
연산에 소요된 시간 = 0.04685
결과값 = 889569211

실험 3번 - Lockguard
연산에 소요된 시간 = 2.15617
결과값 = 3025113384

실험 4번 - Atomic
연산에 소요된 시간 = 1.31186
결과값 = 3025113384

실험 5번 - Future
연산에 소요된 시간 = 0.0189238
결과값 = 3025113384

실험 6번 - promise
연산에 소요된 시간 = 0.0189238
결과값 = 3025113384

실험 7번 - std::transform_reduce
연산에 소요된 시간 = 0.014745
결과값 = 3025113384
  • 1 inner_product 빠름
  • 2 순진한 멀티쓰레딩. 바르지만 레이스 컨디션으로 값이 부정확
  • 3 lock gaurd 느림
  • 4 atomic 느림, 아토믹은 원래 트림
  • 5 task_based parallelism (std::async) 빠름
  • 6 task_based parallelism (std::thread, std::promise) 빠름
  • 7 std::straform_reduce 빠름
profile
좋은 지식 나누어요

0개의 댓글