here below a working implementation that finds the minimal distance between k(set =4 below) clusters in a graph.
I have doubts mainly on the implementation of the Disjoint Set structure: instead of using STL containers, I went along with an array of pointers as data structure hosting the leader nodes, in order to have some practice with C-style pointers.
Here is the Graph structure:
Graph.h
// creates and manages a graph data structure
#ifndef GRAPH_H_INCLUDED
#define GRAPH_H_INCLUDED
#include <array>
#include <vector>
#include <list>
#include <fstream>
#include <string>
#include <iostream>
class Edge
{
public:
// constructors
Edge();
Edge(int, int, int);
std::array<int, 2> getNodes() const;
void setNodes(const int, const int);
int getCost() const;
void setCost(const int);
void swapNodes();
// a get() that allows writing:
int operator[](const int);
bool operator==(const Edge&) const;
bool operator!=(const Edge&) const;
private:
int node1, node2; // nodes are just indices of the nodes in the graph
int cost;
};
class Node
{
friend class Graph; // friendship needed by printing routine in Graph
public:
// constructors
Node();
Node(const Edge&);
int getLabel() const {return label;}
Edge getEdge(const int) const;
void setLabel(const int in) {label = in;}
void addEdge(const Edge in) {edges.push_back(in);}
void addManyEdges(const std::vector<Edge>);
int getScore() const {return score;}
void setScore(const int in) {score = in;}
std::list<Edge>::size_type size() const {return edges.size();} // size of list 'edges'
// iterators
typedef std::list<Edge>::iterator iterator;
typedef std::list<Edge>::const_iterator const_iterator;
std::list<Edge>::iterator begin() {return edges.begin();}
std::list<Edge>::iterator end() {return edges.end();}
std::list<Edge>::const_iterator cbegin() const {return edges.begin();}
std::list<Edge>::const_iterator begin() const {return edges.begin();}
std::list<Edge>::const_iterator cend() const {return edges.end();}
std::list<Edge>::const_iterator end() const {return edges.end();}
// inserts group of edges to the end of a edges vector
std::list<Edge>::iterator insertEdges(const std::list<Edge>::iterator beg_in,
const std::list<Edge>::iterator end_in)
{
return edges.insert(end(), beg_in, end_in);
}
// erase node
std::list<Edge>::iterator erase(int);
bool operator==(const Node&) const;
bool operator!=(const Node&) const;
private:
int label;
std::list<Edge> edges;
int score; // new, starts at 10000000, is equal to lowest cost Edge in 'edges'
};
class Graph
{
public:
Graph();
// constructor from txt file
Graph(const std::string&);
Node getNode(const int index) const {return nodes[index];}
void addNode(const Node in) {nodes.push_back(in);}
std::vector<Node>::size_type size() {return nodes.size();} // size of vector 'nodes'
std::vector<Node>::size_type size() const {return nodes.size();} // size of vector 'nodes'
void output() const; // prints graph
void output(const int) const;
// iterators
typedef std::vector<Node>::iterator iterator;
typedef std::vector<Node>::const_iterator const_iterator;
std::vector<Node>::iterator begin() {return nodes.begin();}
std::vector<Node>::iterator end() {return nodes.end();}
std::vector<Node>::const_iterator begin() const {return nodes.begin();}
std::vector<Node>::const_iterator end() const {return nodes.end();}
Node& operator[](const int index)
{
return nodes[index];
}
std::vector<Node>::iterator erase(const int index)
{
return nodes.erase(nodes.begin() + index);
}
private:
std::vector<Node> nodes;
};
bool compareCosts(const Edge&, const Edge&);
bool compareScores(const Node&, const Node&);
bool compareLabels(const Node&, const Node&);
#endif // GRAPH_H_INCLUDED
Graph.cpp:
#include <iostream>
#include <fstream>
#include <array>
#include <list>
#include <string>
#include <algorithm>
#include "Graph.h"
using std::array; using std::ifstream;
using std::string; using std::endl;
using std::cout; using std::list;
using std::equal;
// Edge
// constructors
Edge::Edge():node1(0), node2(0), cost(0) {}
Edge::Edge(int node_1, int node_2, int len): node1(node_1), node2(node_2), cost(len) {}
array<int, 2> Edge::getNodes() const
{
array<int, 2> ret = {node1, node2};
return ret;
}
void Edge::setNodes(const int in1, const int in2)
{
node1 = in1;
node2 = in2;
}
int Edge::getCost() const
{
return cost;
}
void Edge::setCost(const int len)
{
cost = len;
}
void Edge::swapNodes()
{
node1 = node1 - node2;
node2 += node1;
node1 = node2 - node1;
}
// same as getNodes() above
int Edge::operator[](const int index)
{
if (index == 0) return node1;
else if (index == 1) return node2;
else {
try {throw;}
catch(...) {cout << "edge index must be either 0 or 1" << endl;}
return 1;
}
}
bool Edge::operator==(const Edge& rhs) const
{
if ( (node1 == rhs.getNodes()[0]) && (node2 == rhs.getNodes()[1]) && cost == rhs.getCost() )
return true;
else
return false;
}
bool Edge::operator!=(const Edge& rhs) const
{
if ( !(*this == rhs) )
return true;
else
return false;
}
// Node
//constructors
Node::Node(): label(0), edges(0), score(10000000) {}
Node::Node(const Edge& edg): label(edg.getNodes()[0]), edges(0), score(10000000)
{
edges.push_back(edg);
}
Edge Node::getEdge(const int index) const
{
Edge ret;
list<Edge>::const_iterator iter = edges.begin();
advance(iter, index);
return *iter;
}
void Node::addManyEdges(const vector<Edge> input)
{
for (size_t i = 0; i != input.size(); i++)
{
edges.push_back(input[i]);
}
}
list<Edge>::iterator Node::erase(int index)
{
list<Edge>::iterator iter = edges.begin();
advance(iter, index);
return edges.erase(iter);
}
bool Node::operator==(const Node& rhs) const
{
return label == rhs.getLabel() && equal( edges.begin(), edges.end(), rhs.begin() ); // no need to equate scores
}
bool Node::operator!=(const Node& rhs) const
{
return !(*this == rhs);
}
// Graph
// constructors
Graph::Graph(): nodes(0) {}
Graph::Graph(const string& file_input): nodes(0) // constructor from file
{
string filename(file_input + ".txt");
string line;
ifstream is;
is.open(filename);
int number_nodes; //, number_edges;
is >> number_nodes; // >> number_edges;
nodes.resize(number_nodes); // reserve the Node vector of size 'number_nodes'
int node1, node2, cost;
while (is >> node1 >> node2 >> cost)
{
int nodes_array[2] = {node1, node2};
for (int& node_i : nodes_array) {
if (nodes[node_i - 1].size() == 1)
nodes[node_i - 1].setLabel(node_i);
}
Edge current_edge(node1, node2, cost);
nodes[node1 - 1].addEdge(current_edge);
if (node1 != node2) {
current_edge.swapNodes();
nodes[node2 - 1].addEdge(current_edge);
}
}
is.close();
}
// prints all input nodes
void Graph::output() const
{
for (size_t i = 0; i != nodes.size(); ++i)
{
cout << "Node " << nodes[i].getLabel() << ", size = " << nodes[i].edges.size() << " with edges: ";
for (size_t j = 0; j != nodes[i].edges.size(); ++j)
{
int node_left = nodes[i].getEdge(j).getNodes()[0];
int node_right = nodes[i].getEdge(j).getNodes()[1];
int cost = nodes[i].getEdge(j).getCost();
cout << "[" << node_left << "-" << node_right << ", " << cost << "] ";
}
cout << endl;
}
}
// prints 10 neighbours around picked node
void Graph::output(const int picked_node) const
{
for (int i = picked_node - 5; i != picked_node + 5; ++i)
{
cout << "Node " << nodes[i].getLabel() << ", with edges: ";
for (size_t j = 0; j != nodes[i].edges.size(); ++j)
{
int node_left = nodes[i].getEdge(j).getNodes()[0];
int node_right = nodes[i].getEdge(j).getNodes()[1];
int cost = nodes[i].getEdge(j).getCost();
int score = nodes[node_right - 1].getScore();
cout << "[" << node_left << "-" << node_right << ", " << cost << ", " << score << "] ";
}
cout << endl;
}
}
bool compareCosts(const Edge& a, const Edge& b)
{
return a.getCost() < b.getCost();
}
bool compareScores(const Node& a, const Node& b)
{
return a.getScore() > b.getScore();
}
bool compareLabels(const Node& a, const Node& b)
{
return a.getLabel() > b.getLabel();
}
BFS implementation for Kruskal (alternative to Disjoint Set Kruskal implementation)
BreadthFirstSearch.h
#ifndef BREADTHFIRSTSEARCH_H_INCLUDED
#define BREADTHFIRSTSEARCH_H_INCLUDED
#include <limits>
#include "Graph.h"
int const infinity = std::numeric_limits<int>::infinity();
bool breadthFirstSearch(const Graph&, const int, const int);
#endif // BREADTHFIRSTSEARCH_H_INCLUDED
BreadthFirstSearch.cpp
#include <iostream>
#include <queue>
#include <vector>
#include <map>
#include <algorithm>
#include "BreadthFirstSearch.h"
using std::cout; using std::endl;
using std::cin; using std::find_if;
using std::vector; using std::queue;
using std::map;
bool breadthFirstSearch(const Graph& G, const int start_node_label, const int target_node_label)
{
// define type for explored/unexplored Nodes
enum is_visited {not_visited, visited};
map<int, is_visited> node_is_visited;
for (Graph::const_iterator iter = G.begin(); iter != G.end(); ++iter)
node_is_visited[iter->getLabel()] = not_visited;
Graph::const_iterator start_node_iter =
find_if(G.begin(), G.end(), [=](const Node& i){return i.getLabel() == start_node_label;});
if ( start_node_iter == G.end() )
return false;
node_is_visited[start_node_label] = visited;
Node next_node;
next_node = *start_node_iter;
// breadth-first algorithm runs based on queue structure
queue<Node> Q;
Q.push(next_node);
// BFS algorithm
while (Q.size() != 0) { // out of main loop if all nodes searched->means no path is present
Node current_node = Q.front();
Node linked_node; // variable hosting node on other end
for (size_t i = 0; i != current_node.size(); ++i) { // explore nodes linked to current_node by an edge
int linked_node_label = current_node.getEdge(i).getNodes()[1];
Graph::const_iterator is_linked_node_in_G =
find_if(G.begin(), G.end(), [=](const Node& a){return a.getLabel() == linked_node_label;});
if ( is_linked_node_in_G != G.end() ) { // check linked_node is in G
linked_node = *is_linked_node_in_G; //G_tot.getNode(linked_node_label - 1);
if (node_is_visited[linked_node_label] == not_visited) { // check if linked_node is already in the queue
node_is_visited[linked_node_label] = visited;
Q.push(linked_node); // if not, add it to the queue
// cout << "current " << current_node.getLabel() // for debug
// << " | linked = " << linked_node_label + 1
// << " | path length = " << dist[linked_node_label] << endl;
if (linked_node_label == target_node_label) // end search once target node is explored
return true;
}
} else {
if (linked_node_label == target_node_label) // end search once target node is explored
return false;
}
}
Q.pop();
}
return false;
}
DisjointSet.h
#ifndef DISJOINTSET_H_INCLUDED
#define DISJOINTSET_H_INCLUDED
#include "Graph.h"
class DisjointSet
{
public:
DisjointSet(const size_t);
~DisjointSet();
DisjointSet& operator= (const DisjointSet&);
int find(const Node&);
void unionNodes(const Node&, const Node&);
int get(int index) {return *leaders[index];}
private:
size_t size; // graph size needed for allocation of pointer data members below
int* base; // array of int each Node of the graph has its leader
int** leaders; // array of pointers to int, allows to reassign leaders to Nodes after unions
int find_int(int); // auxiliary to 'find' above
DisjointSet(const DisjointSet&); // copy constructor forbidden
};
#endif // DISJOINTSET_H_INCLUDED
DisjointSet.cpp (here is where advice would be most appreciated)
// Union-find structure (lazy unions)
#include "DisjointSet.h"
DisjointSet::DisjointSet(size_t in): size(in), base(new int[in]), leaders(new int*[in])
{
for (size_t i = 1; i != in + 1; ++i) {
base[i - 1] = i;
leaders[i - 1] = &base[i - 1];
}
}
DisjointSet::~DisjointSet()
{
delete[] base;
delete[] leaders;
}
DisjointSet& DisjointSet::operator= (const DisjointSet& rhs)
{
if (this == &rhs)
return *this; // make sure you aren't self-assigning
if (base != NULL) {
delete[] leaders; // get rid of the old data
delete[] base;
}
// "copy constructor" from here
size = rhs.size;
base = new int[size];
leaders = new int*[size];
base = rhs.base;
for (size_t i = 0; i != size; ++i)
leaders[i] = &base[i];
return *this;
}
// auxiliary to find: implements the recursion
int DisjointSet::find_int(int leader_pos)
{
int parent(leader_pos);
if (leader_pos != *leaders[leader_pos - 1])
parent = find_int(*leaders[leader_pos - 1]);
return parent;
}
// returns leader to input Node
int DisjointSet::find(const Node& input)
{
int parent( input.getLabel() );
if (input.getLabel() != *leaders[input.getLabel() - 1])
parent = find_int(*leaders[input.getLabel() - 1]);
return parent;
}
// merges sets by assigning same leader (the lesser of the two Nodes)
void DisjointSet::unionNodes(const Node& a, const Node& b)
{
if (find(a) != find(b)) {
if (find(a) < find(b))
leaders[find(b) - 1] = &base[find(a) - 1];
else
leaders[find(a) - 1] = &base[find(b) - 1];
}
}
KruskalClustering.h
#ifndef KRUSKALCLUSTERING_H_INCLUDED
#define KRUSKALCLUSTERING_H_INCLUDED
#include <vector>
#include "Graph.h"
#include "DisjointSet.h"
int clusteringKruskalNaive(const Graph&, const std::vector<Edge>&, int);
int clusteringKruskalDisjointSet(const Graph& graph0, const std::vector<Edge>& edges, int);
#endif // KRUSKALCLUSTERING_H_INCLUDED
KruskalClustering.cpp
// Kruskal MST algorithm. Implementation specific to (k=4)-clustering
// -naive (with breadth-first-search to check whether new edge creates a cycle); cost: O(#_edges * #_nodes)
// -and union-find implementations. Cost: O(#_edges*log2(#_nodes))
#include <iostream>
#include <string>
#include <vector>
#include <algorithm> //std::find_if
#include "BreadthFirstSearch.h"
#include "KruskalClustering.h"
using std::cout; using std::endl;
using std::string; using std::vector;
using std::find_if;
int clusteringKruskalNaive(const Graph& graph0, const vector<Edge>& edges, int k)
{
Graph T; // Minimum Spanning Tree
vector<Edge>::const_iterator edges_iter = edges.begin();
int sum_costs(0), number_of_clusters( graph0.size() ); // keep track of overall cost of edges in T, and of clusters
while (number_of_clusters >= k) {
// find out if first node of edge is already in T
Graph::iterator is1_in_T = find_if(T.begin(), T.end(),
[=] (Node& a) {return a.getLabel() == graph0.getNode(edges_iter->getNodes()[0] - 1).getLabel();});
bool is1_in_T_flag; // needed because T gets increased and thus invalidates iterator is1_in_T
Node* node_1 = new Node(*edges_iter); // no use of pointer here, it creates a new Node anyway, can't move Nodes to T
if ( is1_in_T == T.end() ) { // node_1 not in T so we add it
T.addNode(*node_1);
number_of_clusters--; // node_1 is not its own cluster any more
delete node_1; // node_1 copied to T so ...
node_1 = &(T[T.size() - 1]); // ... it now points there
sum_costs += (*edges_iter).getCost();
is1_in_T_flag = false;
} else { // node_1 is in T already
delete node_1; // if so, just update the pointer
node_1 = &(*is1_in_T);
is1_in_T_flag = true;
}
// perform BFS to check for cycles
bool check_cycles = breadthFirstSearch(T, edges_iter->getNodes()[0], edges_iter->getNodes()[1]);
// create an identical edge, but with nodes positions swapped
Edge swapped_edge = *edges_iter;
swapped_edge.swapNodes();
// find out if second node of edge is already in T
Graph::iterator is2_in_T = find_if( T.begin(), T.end(),
[=] (Node& a) {return a.getLabel() == graph0.getNode(edges_iter->getNodes()[1] - 1).getLabel();});
// (either node1 or 2 not in T, or both, or both present but in different clusters of T)
if (!check_cycles) { // if edges_iter creates no cycle when added to T
if (is1_in_T_flag){ // if node_1 was already present in T
(*node_1).addEdge(*edges_iter); // just add new edge to node_1 list of edges
sum_costs += (*edges_iter).getCost();
} else
number_of_clusters++; // if node_1 not present, it means number_of_cl was decreased above ...
number_of_clusters--; // ... and number_of_cl can decrease just by one, if adding an edge to T
if ( is2_in_T != T.end() ) // node_2 already in T: just update its list of edges
(*is2_in_T).addEdge(swapped_edge);
else { // node_2 not in T, so add it
Node node_2(swapped_edge);
T.addNode(node_2);
}
} else { // cycle created by *edges_iter
if (!is1_in_T_flag) // in case the cycle happened just after adding node_1:
(*is2_in_T).addEdge(swapped_edge); // add edge to node_2, num_clusters already updated by node_1
}
if (number_of_clusters >= k) // advance to next Edge if num_clusters > k
edges_iter++;
// debug
// T.output();
// cout << "next edge: (" << (*edges_iter).getNodes()[0] << "-"
// << (*edges_iter).getNodes()[1] << ") " << endl;
// cout << "clustering: " << number_of_clusters << endl;
}
cout << "Sum of MST lengths is: " << sum_costs << endl;
return (*edges_iter).getCost();
}
// same algorithm, implemented with Union-find structure
int clusteringKruskalDisjointSet(const Graph& graph0, const vector<Edge>& edges, int k)
{
DisjointSet disjoint_set_graph0( graph0.size() ); // create Union-find structure
Graph T;
vector<Edge>::const_iterator edges_iter = edges.begin();
int sum_costs(0), number_of_clusters( graph0.size() );
while ( number_of_clusters >= k ) {
// if nodes in Edge have not the same leader in the disjoint set, then no loop is created, and T can add the edge
if ( disjoint_set_graph0.find(graph0.getNode(edges_iter->getNodes()[0] - 1)) !=
disjoint_set_graph0.find(graph0.getNode(edges_iter->getNodes()[1] - 1)) ) {
sum_costs += (*edges_iter).getCost();
number_of_clusters--; // no cycle created so the edge will be added to T
// look for node_1 in T
Graph::iterator is1_in_T = find_if(T.begin(), T.end(),
[=] (Node& a) {return a.getLabel() == graph0.getNode(edges_iter->getNodes()[0] - 1).getLabel();});
if ( is1_in_T == T.end() ) { // if node_1 not in T add it
Node node1(*edges_iter);
T.addNode(node1);
} else // if node_1 already in T only add to it this edge
(*is1_in_T).addEdge(*edges_iter);
Edge swapped_edge = *edges_iter;
swapped_edge.swapNodes();
// look for node_2 in T
Graph::iterator is2_in_T = find_if(T.begin(), T.end(),
[=] (Node& a) {return a.getLabel() == graph0.getNode(edges_iter->getNodes()[1] - 1).getLabel();});
if ( is2_in_T == T.end() ) { // same as for node_1
Node node2(swapped_edge);
T.addNode(node2);
} else
(*is2_in_T).addEdge(swapped_edge);
// merge the 2 nodes' sets: update their disjointed set leaders
disjoint_set_graph0.unionNodes( graph0.getNode(edges_iter->getNodes()[0] - 1),
graph0.getNode(edges_iter->getNodes()[1] - 1) );
}
if (number_of_clusters >= 4)
edges_iter++;
//debug
// T.output();
// cout << "next edge: (" << (*edges_iter).getNodes()[0] << "-"
// << (*edges_iter).getNodes()[1] << ") " << endl;
// cout << "clustering: " << number_of_clusters << endl;
// for (size_t i = 0; i != graph0.size(); ++i)
// cout << disjoint_set_graph0.get(i) << " ";
// cout << endl;
}
cout << "Sum of MST lengths is: " << sum_costs << endl;
return (*edges_iter).getCost();
}
Lastly, main.cpp :
/* A max-spacing k-clustering program based on Kruskal MST algorithm.
The input file lists a complete graph with edge costs.
clusters k = 4 assumed.*/
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <algorithm> // std::sort;
#include "Graph.h"
#include "KruskalClustering.h"
using std::cout; using std::endl;
using std::string; using std::ifstream;
using std::vector; using std::sort;
int main(int argc, char** argv)
{
cout << "Reading list of edges from input file ...\n" << endl;
// read graph0 from input file
string filename(argv[1]);
Graph graph0(filename);
// graph0.output(); // debug
cout << endl;
// re-read input file and create a list of all edges in graph0
ifstream is(filename + ".txt");
int nodes_size;
is >> nodes_size;
vector<Edge> edges;
int node1, node2, cost;
while (is >> node1 >> node2 >> cost) {
Edge current_edge(node1, node2, cost);
edges.push_back(current_edge);
}
is.close();
// sort the edge list by increasing cost
cout << "Sorting edges ...\n" << endl;
sort(edges.begin(), edges.end(), compareCosts);
// for (vector<Edge>::iterator iter = edges.begin(); iter != edges.end(); ++iter) // debug
// cout << (*iter).getNodes()[0] << " " << (*iter).getNodes()[1] << " " << (*iter).getCost() << endl;
cout << "Kruskal algorithm: Computing minimal distance between clusters when they are reduced to 4 ...\n" << endl;
int k = 4; // number of clusters desired
// pick implementation, comment the other
//int clustering_min_dist = clusteringKruskalNaive(graph0, edges, k);
int clustering_min_dist = clusteringKruskalDisjointSet(graph0, edges, k);
cout << "k = " << k << " clusters minimal distance is: " << clustering_min_dist << endl;
return 0;
}