Curvesort is an adaptive integer sorting algorithm I discovered independently. It maintains a doubly-linked list, whose nodes have the following structure (in pseudo-C code):
struct node {
int key;
int count;
node* next;
node* prev;
}
Above, count gives you the number of times key appeared in the sequence so far. The entire list is sorted by node.key values. Also, as the algorithm scans the input sequence, it keeps track of the last processed integer and the node that holds it. This allows us to insert a new element faster if it is "closer" to the previously processed integer.
The intuition behind the adaptiveness of the algorithm is as follows: if you draw a "graph" of the input sequence \$A\$ as a set of points \$(i, A_i)\$, the "smoother" the graph, the faster the algorithm finishes its work.
So the best running time is linear. Otherwise, it is not worse than \$\Theta(nk)\$, where \$k\$ is the number of distinct integers. Since \$k \leq n\$, the worst case running time is not worse than \$\Theta(n^2)\$.
My code is as follows:
curvesort.h:
#ifndef CURVESORT_H
#define CURVESORT_H
#include <algorithm>
#include <iostream>
#include <iterator>
#include <type_traits>
namespace net {
namespace coderodde {
namespace sorting {
template<typename Iter>
void sort_impl(Iter begin, Iter end, std::true_type)
{
using value_type =
typename std::iterator_traits<Iter>::value_type;
class Curvesort {
struct Node {
value_type key;
size_t count;
struct Node* prev;
struct Node* next;
Node(value_type key) : key{key}, count{1} {}
};
value_type previous_key;
Node* head;
Node* tail;
Node* prev_updated; // Points to the most recently added or
// updated node.
public:
// We choose to add the first value in the constructor so
// that in count we do not have to make an additional check
// whether the node list is empty or not. This adds to
// efficiency.
Curvesort(value_type first_value)
:
previous_key{first_value}
{
Node* node = new Node(first_value);
node->next = nullptr;
node->prev = nullptr;
head = node;
tail = node;
prev_updated = node;
}
~Curvesort()
{
Node* next;
while (head)
{
next = head->next;
delete head;
head = next;
}
}
void update_smaller_node(value_type key)
{
Node* tmp = prev_updated->prev;
while (tmp && tmp->key > key)
{
tmp = tmp->prev;
}
if (tmp == nullptr)
{
// 'key' is smaller than any currently stored key.
// Therefore, we update the list head node:
Node* newnode = new Node(key);
newnode->next = head;
newnode->prev = nullptr;
head->prev = newnode;
head = newnode;
prev_updated = newnode;
}
else if (tmp->key == key)
{
// Node exists, just increment the counter:
tmp->count++;
prev_updated = tmp;
}
else
{
// Insert a new node between 'tmp' and 'tmp.next':
Node* newnode = new Node(key);
newnode->prev = tmp;
newnode->next = tmp->next;
newnode->prev->next = newnode;
newnode->next->prev = newnode;
prev_updated = newnode;
}
}
void update_greater_node(value_type key)
{
Node* tmp = prev_updated->next;
while (tmp && tmp->key < key)
{
tmp = tmp->next;
}
if (tmp == nullptr)
{
// 'key' is larger than any currently stored key.
// Therefore, we update the list tail node:
Node* newnode = new Node(key);
newnode->prev = tail;
newnode->next = nullptr;
tail->next = newnode;
tail = newnode;
prev_updated = newnode;
}
else if (tmp->key == key)
{
// Node exists, just increment the counter:
tmp->count++;
prev_updated = tmp;
}
else
{
// Insert a new node between 'tmp.prev' and 'tmp':
Node* newnode = new Node(key);
newnode->prev = tmp->prev;
newnode->next = tmp;
tmp->prev->next = newnode;
tmp->prev = newnode;
prev_updated = newnode;
}
}
void count(Iter begin, Iter end)
{
begin++; // Omit the first value since we added to the
// node list.
while (begin != end)
{
value_type current_key = *begin;
if (current_key < previous_key)
{
update_smaller_node(current_key);
}
else if (current_key > previous_key)
{
update_greater_node(current_key);
}
else
{
prev_updated->count++;
}
previous_key = current_key;
begin++;
}
}
void build(Iter begin)
{
Iter iter = begin;
for (Node* node = head; node; node = node->next)
{
size_t count = node->count;
value_type key = node->key;
for (size_t i = 0; i != count; ++i)
{
*iter++ = key;
}
}
}
};
Curvesort curvesort(*begin);
curvesort.count(begin, end);
curvesort.build(begin);
}
template<typename Iter>
void sort_impl(Iter begin, Iter end, std::false_type)
{
// Not a sequence of primitive integral types. Fall back to
// std::sort.
std::sort(begin, end);
}
template<typename Iter>
void sort(Iter begin, Iter end)
{
using value_type =
typename std::iterator_traits<Iter>::value_type;
sort_impl(begin, end, std::is_integral<value_type>());
}
} // End of 'net::coderodde::sorting'
} // End of 'net::coderodde'
} // End of 'net'
#endif // CURVESORT_H
main.cpp:
#include "curvesort.h"
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <list>
#include <vector>
using std::boolalpha;
using std::cout;
using std::endl;
using std::equal;
using std::list;
using std::vector;
class CurrentTime {
std::chrono::high_resolution_clock m_clock;
public:
uint64_t milliseconds()
{
return std::chrono::duration_cast<std::chrono::milliseconds>
(m_clock.now().time_since_epoch()).count();
}
};
static const int64_t amplitude = 5000;
static const size_t phase_length = 100 * 1000;
static const size_t length = 20 * 1000 * 1000;
template<typename Container>
void sin_populate_container(Container& cont,
const size_t length,
const int64_t amplitude,
const size_t phase_length)
{
for (size_t i = 0; i != length; ++i)
{
cont.push_back(
(int64_t) amplitude * sin(2.0 * M_PI * i / phase_length)
);
}
}
int main() {
CurrentTime ct;
//// std::vector demo ////
cout << "std::vector demo:" << endl;
vector<int64_t> vec;
sin_populate_container(vec, length, amplitude, phase_length);
vector<int64_t> vec2(vec);
uint64_t start = ct.milliseconds();
net::coderodde::sorting::sort(vec.begin(), vec.end());
uint64_t end = ct.milliseconds();
cout << "Curvesort in " << (end - start) << " milliseconds." << endl;
start = ct.milliseconds();
std::sort(vec2.begin(), vec2.end());
end = ct.milliseconds();
cout << "std::sort in " << (end - start) << " millisecond." << endl;
cout << "Algorithms agree: "
<< boolalpha
<< equal(vec.begin(), vec.end(), vec2.begin()) << endl;
//// std::list demo ////
cout << "std::list demo:" << endl;
list<int64_t> lst;
sin_populate_container(lst, length, amplitude, phase_length);
list<int64_t> lst2(lst);
start = ct.milliseconds();
net::coderodde::sorting::sort(lst.begin(), lst.end());
end = ct.milliseconds();
cout << "Curvesort in " << (end - start) << " milliseconds." << endl;
start = ct.milliseconds();
lst2.sort();
end = ct.milliseconds();
cout << "std::list.sort in " << (end - start) << " millisecond." << endl;
cout << "Algorithms agree: "
<< boolalpha
<< equal(lst.begin(), lst.end(), lst2.begin()) << endl;
}
Compiled with -O3, I get the following performance figures:
std::vector demo: Curvesort in 122 milliseconds. std::sort in 604 millisecond. Algorithms agree: true std::list demo: Curvesort in 294 milliseconds. std::list.sort in 5836 millisecond. Algorithms agree: true
As always, any critique is much appreciated.
std::map<T, std::size_t>then sorting by keys? \$\endgroup\$cont.push_back(rand());and now it is horrible. \$\endgroup\$std::sortwhenever the number of distinct integers (\$k\$) exceeds a threshold depending on \$n\$. \$\endgroup\$lengthparameter from20*1000*1000, then I would expect a run duration of about 100 days. Forlength=100*1000it already takes 1 min for me. \$\endgroup\$