Cost-coupled: classification via Logistic Regression

For the cost-coupled set-up, we consider a classification scenario [NedOz09]. In this example, a linear model is trained by minimizing a sum of logistic loss functions. The complete code of this example is given at the end of this page.

Problem formulation

Suppose there are \(N\) agents, where each agent \(i\) is equipped with \(m_i\) points \(p_{i,1}, \ldots, p_{i,m_i} \in \mathbb{R}^d\) (which represent training samples in a \(d\)-dimensional feature space). Moreover, suppose each point is associated with a binary label, that is, each point \(p_{i,j}\) is labeled with \(\ell_{i,j} \in \{-1,1\}\), for all \(j \in \{1, \ldots, m_i\}\) and \(i \in \{1, \ldots, N\}\). The problem consists of building a linear classification model from the training samples by maximizing the a-posteriori probability of each class. In particular, we look for a separating hyperplane of the form \(\{ z \in \mathbb{R}^d \mid w^\top z + b = 0 \}\), whose parameters \(w\) and \(b\) can be determined by solving the convex optimization problem

\[\min_{w, b} \: \sum_{i=1}^N \: \sum_{j=1}^{m_i} \log \left[ 1 + e^{ -(w^\top p_{i,j} + b) \ell_{i,j} } \right] + \frac{C}{2} \|w\|^2,\]

where \(C > 0\) is the regularization parameter. Notice that the problem is cost-coupled (refer to the general formulation), with each local cost function \(f_i\) given by

\[f_i(w, b) = \sum_{j=1}^{m_i} \log \left[ 1 + e^{ -(w^\top p_{i,j} + b) \ell_{i,j} } \right] + \frac{C}{2N} \|w\|^2, \hspace{1cm} i \in \{1, \ldots, N\}.\]
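
Indeed, summing the local cost functions over all agents recovers the original objective, since the \(N\) local regularization terms add up to \(\frac{C}{2} \|w\|^2\):

\[\sum_{i=1}^N f_i(w, b) = \sum_{i=1}^N \sum_{j=1}^{m_i} \log \left[ 1 + e^{ -(w^\top p_{i,j} + b) \ell_{i,j} } \right] + N \cdot \frac{C}{2N} \|w\|^2.\]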

The goal is to make agents agree on a common solution \((w^\star, b^\star)\), so that all of them can compute the separating hyperplane as \(\{ z \in \mathbb{R}^d \mid (w^\star)^\top z + b^\star = 0 \}\).
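
Once the agents have agreed on \((w^\star, b^\star)\), classifying a new sample amounts to checking the sign of \((w^\star)^\top z + b^\star\). A minimal sketch with numpy follows; the vector x_star stands for the stacked output \([w^\star; b^\star]\) of a distributed algorithm (its values here are hypothetical, for illustration only):

import numpy as np

# hypothetical stacked solution [w_star; b_star] returned by a distributed algorithm
x_star = np.array([[1.2], [-0.8], [0.5]])
w_star, b_star = x_star[:-1], x_star[-1, 0]

# classify a new sample z_new in R^d by the sign of w^T z + b
z_new = np.array([[2.0], [1.0]])
predicted_label = 1 if (w_star.T @ z_new).item() + b_star >= 0 else -1
print(predicted_label)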

Data generation model

We consider a two-dimensional sample space (\(d = 2\)). Each agent generates a certain number of samples (between 2 and 5) for each label. For each label, the samples are drawn from a multivariate Gaussian distribution with covariance matrix equal to the identity and mean equal to \((0,0)\) (for the label \(1\)) and \((3,2)\) (for the label \(-1\)). The regularization parameter is set to \(C = 10\).
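
As a standalone illustration of this data model (the launcher below performs the same steps, drawing the sample counts at random), a single agent could generate its local dataset as follows; the sample counts n_pos and n_neg are example values:

import numpy as np

d = 2
mu = (np.array([0, 0]), np.array([3, 2]))   # means for labels +1 and -1
sigma = (np.eye(d), np.eye(d))              # identity covariance for both labels
n_pos, n_neg = 3, 4                         # example counts (between 2 and 5 in the launcher)

# draw the samples for the two labels (one column per sample)
pos = np.random.multivariate_normal(mu[0], sigma[0], n_pos).T
neg = np.random.multivariate_normal(mu[1], sigma[1], n_neg).T
points = np.hstack((pos, neg))
labels = np.concatenate((np.ones(n_pos), -np.ones(n_neg)))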

Complete code

examples/setups/logistic_regression/launcher.py
###############################################################
# COST-COUPLED Example
# Logistic Regression for Classification
#
# Each agent has a certain number of randomly generated points, labeled 1 or -1.
# The points are generated by agents according to a multivariate normal distribution,
# with different mean and covariance for the two labels.
###############################################################
# Compared Algorithms:
#
# - Distributed Subgradient
# - Gradient Tracking
###############################################################

import dill as pickle
import numpy as np
from mpi4py import MPI
from disropt.agents import Agent
from disropt.algorithms import SubgradientMethod, GradientTracking
from disropt.functions import Variable, SquaredNorm, Logistic
from disropt.utils.graph_constructor import binomial_random_graph, metropolis_hastings
from disropt.problems import Problem

# get MPI info
NN = MPI.COMM_WORLD.Get_size()
agent_id = MPI.COMM_WORLD.Get_rank()

# Generate a common graph (everyone uses the same seed)
Adj = binomial_random_graph(NN, p=0.3, seed=1)
W = metropolis_hastings(Adj)

np.random.seed(10*agent_id)

#####################
# Problem parameters
#####################

# parameters of gaussians
mu = (np.array([0, 0]).transpose(), np.array([3, 2]).transpose())
sigma = (np.eye(2), np.eye(2))

dim = mu[0].shape[0]  # dimension of sample space

# number of samples (min 2 max 5 for each label)
nsamp = (np.random.randint(2, 6), np.random.randint(2, 6))

# regularization parameter
C = 10

#####################
# Generate problem data
#####################

# points
points = np.zeros((dim, nsamp[0]+nsamp[1]))
points[:, 0:nsamp[0]] = np.random.multivariate_normal(mu[0], sigma[0], nsamp[0]).transpose()
points[:, nsamp[0]:] = np.random.multivariate_normal(mu[1], sigma[1], nsamp[1]).transpose()

# labels
labels = np.ones((sum(nsamp), 1))
labels[nsamp[0]:] = -labels[nsamp[0]:]

# cost function
z = Variable(dim+1)  # stacked optimization variable z = [w; b]
A = np.ones((dim+1, 1))
A[-1] = 0
obj_func = (C / (2 * NN)) * SquaredNorm(A @ z)  # local regularization term

for j in range(sum(nsamp)):
    e_j = np.zeros((sum(nsamp), 1))
    e_j[j] = 1
    A_j = np.vstack((points @ e_j, 1))  # j-th augmented sample [p_j; 1]
    obj_func += Logistic(- labels[j] * A_j @ z)  # log(1 + exp(-(w'p_j + b) * label_j))

#####################
# Distributed algorithms
#####################

# local agent and problem
agent = Agent(
    in_neighbors=np.nonzero(Adj[agent_id, :])[0].tolist(),
    out_neighbors=np.nonzero(Adj[:, agent_id])[0].tolist(),
    in_weights=W[agent_id, :].tolist())
pb = Problem(obj_func)
agent.set_problem(pb)

# instantiate the algorithms
x0 = 5*np.random.rand(dim+1, 1)

subgr = SubgradientMethod(agent=agent,
                          initial_condition=x0,
                          enable_log=True)

gradtr = GradientTracking(agent=agent,
                          initial_condition=x0,
                          enable_log=True)

# diminishing stepsize for the Distributed Subgradient method
def step_gen(k):
    return 1/((k+1)**0.6)

# constant stepsize for Gradient Tracking
constant_stepsize = 0.001
num_iterations = 20000

# run the algorithms
subgr_seq      = subgr.run(iterations=num_iterations, stepsize=step_gen)
gradtr_seq     = gradtr.run(iterations=num_iterations, stepsize=constant_stepsize)

# print results
print("Subgradient method: agent {}: {}".format(agent_id, subgr.get_result().flatten()))
print("Gradient tracking: agent {}: {}".format(agent_id, gradtr.get_result().flatten()))

# save information
if agent_id == 0:
    with open('info.pkl', 'wb') as output:
        pickle.dump({'N': NN, 'size': dim+1, 'iterations': num_iterations}, output, pickle.HIGHEST_PROTOCOL)

with open('agent_{}_func.pkl'.format(agent_id), 'wb') as output:
    pickle.dump(obj_func, output, pickle.HIGHEST_PROTOCOL)

np.save("agent_{}_seq_subgr.npy".format(agent_id), np.squeeze(subgr_seq))
np.save("agent_{}_seq_gradtr.npy".format(agent_id), np.squeeze(gradtr_seq))
examples/setups/logistic_regression/results.py
import numpy as np
import matplotlib.pyplot as plt
from disropt.problems import Problem
import pickle

# initialize
with open('info.pkl', 'rb') as inp:
    info = pickle.load(inp)
NN    = info['N']
iters = info['iterations']
size  = info['size']

# load agent data
seq_subgr   = np.zeros((NN, iters, size))
seq_gradtr  = np.zeros((NN, iters, size))
local_function = {}
for i in range(NN):
    seq_subgr[i,:,:]   = np.load("agent_{}_seq_subgr.npy".format(i))
    seq_gradtr[i,:,:]  = np.load("agent_{}_seq_gradtr.npy".format(i))
    with open('agent_{}_func.pkl'.format(i), 'rb') as inp:
        local_function[i] = pickle.load(inp)

# solve centralized problem
global_obj_func = 0
for i in range(NN):
    global_obj_func += local_function[i]

global_pb = Problem(global_obj_func)
x_centr = global_pb.solve()
cost_centr = global_obj_func.eval(x_centr)
x_centr = x_centr.flatten()

# compute cost errors
cost_err_subgr   = np.zeros((NN, iters))
cost_err_gradtr  = np.zeros((NN, iters))

for i in range(NN):
    for t in range(iters):
        # first compute global function value at local point
        cost_ii_tt_subgr   = 0
        cost_ii_tt_gradtr  = 0
        for j in range(NN):
            cost_ii_tt_subgr   += local_function[j].eval(seq_subgr[i, t, :][:, None])
            cost_ii_tt_gradtr  += local_function[j].eval(seq_gradtr[i, t, :][:, None])
        
        # then compute errors
        cost_err_subgr[i, t]   = abs(cost_ii_tt_subgr - cost_centr)
        cost_err_gradtr[i, t]  = abs(cost_ii_tt_gradtr - cost_centr)

# plot maximum cost error
plt.figure()
plt.title('Maximum cost error (among agents)')
plt.xlabel(r"iteration $k$")
plt.ylabel(r"$\max_{i} \: \left|(\sum_{j=1}^N f_j(x_i^k) - f^\star)/f^\star \right|$")
plt.semilogy(np.arange(iters), np.amax(cost_err_subgr/cost_centr, axis=0), label='Distributed Subgradient')
plt.semilogy(np.arange(iters), np.amax(cost_err_gradtr/cost_centr, axis=0), label='Gradient Tracking')
plt.legend()

# plot maximum solution error
sol_err_subgr   = np.linalg.norm(seq_subgr - x_centr[None, None, :], axis=2)
sol_err_gradtr  = np.linalg.norm(seq_gradtr - x_centr[None, None, :], axis=2)

plt.figure()
plt.title('Maximum solution error (among agents)')
plt.xlabel(r"iteration $k$")
plt.ylabel(r"$\max_{i} \: \|x_i^k - x^\star\|$")
plt.semilogy(np.arange(iters), np.amax(sol_err_subgr, axis=0), label='Distributed Subgradient')
plt.semilogy(np.arange(iters), np.amax(sol_err_gradtr, axis=0), label='Gradient Tracking')
plt.legend()

plt.show()

The two files can be executed by issuing the following commands in the example folder (the number of agents equals the number of MPI processes, 20 in this case):

> mpirun -np 20 --oversubscribe python launcher.py
> python results.py

References

NedOz09

Nedić, Angelia; Ozdaglar, Asuman: Distributed subgradient methods for multi-agent optimization. IEEE Transactions on Automatic Control 54.1 (2009): 48-61.