Skip to content
164 changes: 164 additions & 0 deletions vb_suite/extras_indexing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
#----------------------------------------------------------------------
# Thorough checks of all containers and all indexing types

from vbench.benchmark import Benchmark

SECTION = 'Exhaustive check of indexing and scalar value access'

common_setup = """from pandas_vb_common import *
"""


import pandas.util.testing as tm

MAX_ENTRIES = 100000

# FIXME: makeCustomIndexWithCache reimplements (sort of) tm.makeCustomIndex,
# because the latter doesn't offer customization of date/period index
# frequencies and integer index offset.

setup_template = common_setup + """
import sys
import pandas as pd

try:
make_index = tm.makeCustomIndexWithCache
except AttributeError:
MAX_ENTRIES = %(MAX_ENTRIES)s
_indices = {}

def makeCustomIndexWithCache(nentries, idx_type):
assert nentries <= MAX_ENTRIES

key = idx_type
try:
full_idx = _indices[key]
except KeyError:
if idx_type == 'mi':
full_idx = tm.makeCustomIndex(nentries=MAX_ENTRIES, nlevels=2)
elif idx_type == 'dt':
full_idx = pd.date_range('2000-01-01', periods=MAX_ENTRIES, freq='T')
elif idx_type == 'p':
full_idx = pd.period_range('2000-01-01', periods=MAX_ENTRIES, freq='T')
elif idx_type == 's':
full_idx = tm.makeStringIndex(k=MAX_ENTRIES)
elif idx_type == 'u':
full_idx = tm.makeUnicodeIndex(k=MAX_ENTRIES)
elif idx_type == 'i':
full_idx = pd.Index(np.arange(MAX_ENTRIES) + MAX_ENTRIES)
elif idx_type == 'f':
full_idx = tm.makeFloatIndex(MAX_ENTRIES)
else:
raise ValueError('Wrong idx type: %%s' %% idx_type)

_indices[key] = full_idx

return full_idx[:nentries]

make_index = tm.makeCustomIndexWithCache = makeCustomIndexWithCache

obj = %(class_name)s(%(ctor_args)s)

pos = -1
axis = obj._get_axis(%(axis)r)
label = axis[pos]
arr_pos = np.arange(int(len(axis) / 2))
arr_label = axis[arr_pos].values
mask = tm.np.arange(len(axis)) %% 3 == 0
series_mask = Series(mask)
"""

# generate_index_benchmarks(
# klass, long_axis=axis, idx_type=idx_type, is_dup=is_dup)


def generate_index_benchmarks(klass, idx_type, long_axis):
    """
    Build the indexing benchmark matrix for one container class.

    Parameters
    ----------
    klass : type
        Container class (tm.Series, tm.DataFrame or tm.Panel).
    idx_type : str
        Index-type code for the long axis ('s', 'i', 'dt', 'p', 'mi', ...);
        every other axis gets an integer index ('i').
    long_axis : int
        Axis that receives MAX_ENTRIES entries; all other axes get 10.

    Returns
    -------
    dict
        Maps benchmark name -> Benchmark instance.
    """
    ndim = klass().ndim

    # One designated long axis, every other axis kept short.
    dims = [10] * ndim
    dims[long_axis] = MAX_ENTRIES
    dims = tuple(dims)

    axis_types = ['i'] * ndim
    axis_types[long_axis] = idx_type
    axis_types = tuple(axis_types)

    # Constructor keyword arguments executed inside the benchmark setup code
    # (one make_index(...) call per axis).
    ctor_args = ',\n '.join(
        ['%s=make_index(nentries=%r, idx_type=%r)' % spec
         for spec in zip(klass._AXIS_ORDERS, dims, axis_types)])

    def bench_name(indexer, axis):
        # Name encodes class, indexer kind, per-axis size/type and (when
        # applicable) the indexed axis.
        size_tag = 'x'.join(['%s%s' % pair for pair in zip(dims, axis_types)])
        parts = ['indexing_', klass.__name__.lower(), indexer, size_tag]
        if axis is not None:
            parts.append("ax%s" % axis)
        return '_'.join(parts)

    def subscript(attrname, indexer_str, axis):
        # axis=None -> flat indexing (e.g. obj.iloc[pos]); otherwise place
        # the indexer at `axis` with ':' in every other slot.
        if axis is None:
            return '%s[%s]' % (attrname, indexer_str)
        slots = [':,'] * ndim
        slots[axis] = indexer_str + ','
        return '%s[%s]' % (attrname, ''.join(slots))

    # (indexer tag, accessor attribute, indexing expression) triples.
    cases = [('basic_pos', '.iloc', 'pos'),
             ('basic_label', '.loc', 'label'),
             ('slice_pos', '.iloc', ':pos'),
             ('slice_label', '.loc', ':label'),
             ('arr_pos', '.iloc', 'arr_pos'),
             ('arr_label', '.loc', 'arr_label'),
             ('iloc_mask', '.iloc', 'mask'),
             ('loc_mask', '.loc', 'mask')]

    result = {}
    # Benchmark flat access (None) plus access along first and last axes.
    for axis in set([None, 0, ndim - 1]):
        for indexer, accessor, expr in cases:
            setup = setup_template % {'class_name': klass.__name__,
                                      'ctor_args': ctor_args,
                                      'axis': axis or 0,
                                      'MAX_ENTRIES': MAX_ENTRIES}
            bmk = Benchmark('obj%s' % subscript(accessor, expr, axis),
                            setup,
                            name=bench_name(indexer, axis))
            result[bmk.name] = bmk

    return result

# Benchmarks are generated as follows: given a container type, generate an
# instance of it with one of the axes long enough to produce statistically
# significant timing values and try different kinds of indexing on it.
#
# Generated benchmark set involves a cartesian product of
# - container types
# - designated "long" axis (minor or major one)
# - "long" axis type (string, integer, datetime, period, multiindex)
# - indexer type (positional, slice, fancy, etc.)
# - indexer axis (indexing is not limited to "long" axis)
# - label/positional indexer
#
# FIXME: add multiindex indexers?
# FIXME: add non-unique axes?
# FIXME: add non-unique non-monotonic axes?
# Drive the generation: for every container type, for both the first and the
# last axis as the designated "long" one, and for every supported long-axis
# index type, build the benchmark matrix and publish the Benchmark instances
# at module level (the suite discovers them by scanning module globals).
for klass in (tm.Series, tm.DataFrame, tm.Panel):
    for axis in set([0, klass().ndim - 1]):  # set() dedupes to {0} for Series
        for idx_type in ('s', 'i', 'dt', 'p', 'mi'):
            bms = generate_index_benchmarks(
                klass, long_axis=axis, idx_type=idx_type)
            globals().update(bms)
120 changes: 120 additions & 0 deletions vb_suite/indexing_exhaustive.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
#----------------------------------------------------------------------
# Thorough checks of all containers and all indexing types

from vbench.benchmark import Benchmark

SECTION = 'Exhaustive check of indexing and scalar value access'

common_setup = """from pandas_vb_common import *
"""


import pandas.util.testing as tm

setup_template = common_setup + """
import sys

try:
make_index = tm.makeCustomIndexWithCache
except AttributeError:
MAX_ENTRIES = 1000000
_indices = {}

def makeCustomIndexWithCache(nentries, **kwargs):
assert nentries < MAX_ENTRIES

key = tuple(kwargs.items())
try:
full_idx = _indices[key]
except KeyError:
full_idx = _indices[key] = tm.makeCustomIndex(nentries=MAX_ENTRIES,
**kwargs)
return full_idx[:nentries]

make_index = tm.makeCustomIndexWithCache = makeCustomIndexWithCache

obj = %(class_name)s(%(ctor_args)s)

pos = -1
axis = obj._get_axis(%(axis)r)
label = axis[pos]
arr_pos = np.arange(int(len(axis) / 2))
arr_label = axis[arr_pos].values
mask = tm.np.arange(len(axis)) %% 3 == 0
series_mask = Series(mask)
"""


def generate_index_benchmarks(klass, idx_type, shape):
    """
    Build indexing benchmarks for one container class.

    Parameters
    ----------
    klass : type
        Container class (tm.Series, tm.DataFrame or tm.Panel).
    idx_type : str or tuple of str
        tm.makeCustomIndex index-type code(s); a scalar is broadcast to
        every axis.
    shape : int or tuple of int
        Number of entries per axis; a scalar means a 1-d container.

    Returns
    -------
    dict
        Maps benchmark name -> Benchmark instance.
    """
    # Normalize scalar arguments to per-axis tuples.
    if not isinstance(shape, tuple):
        shape = (shape,)
    ndim = len(shape)

    if not isinstance(idx_type, tuple):
        idx_types = tuple([idx_type] * ndim)
    else:
        assert len(idx_type) == ndim
        idx_types = idx_type

    # Constructor keyword arguments executed inside the benchmark setup code
    # (one make_index(...) call per axis).
    axes = klass._AXIS_ORDERS
    ctor_args = ',\n '.join([
        '%s=make_index(idx_type=%r, nentries=%s, nlevels=1)' % v
        for v in zip(axes, idx_types, shape)])

    def get_benchmark_name(indexer, axis):
        # Name encodes class, indexer kind, per-axis size/type and (when
        # applicable) the indexed axis.
        shape_type_str = 'x'.join([str(s) + str(t)
                                   for s, t in zip(shape, idx_types)])

        components = ['indexing_', klass.__name__.lower(), indexer,
                      shape_type_str]
        if axis is not None:
            components.append("ax%s" % axis)

        return '_'.join(components)

    def make_suffix(attrname, indexer_str, axis):
        # axis=None means flat indexing (e.g. obj.iloc[pos]); otherwise the
        # indexer is placed at `axis` with ':' in every other slot.
        if axis is not None:
            indexers = [':,'] * ndim
            indexers[axis] = indexer_str + ','
            indexer_str = ''.join(indexers)
        return '%s[%s]' % (attrname, indexer_str)

    # Benchmark flat access (None) plus access along the first/last axes.
    benchmarked_axes = set([None, 0, ndim - 1])

    result = {}
    for axis in benchmarked_axes:
        for params in [
            {'indexer': 'basic_pos',
             'suffix': make_suffix('.iloc', 'pos', axis)},
            {'indexer': 'basic_label',
             'suffix': make_suffix('.loc', 'label', axis)},

            {'indexer': 'slice_pos',
             'suffix': make_suffix('.iloc', ':pos', axis)},
            {'indexer': 'slice_label',
             'suffix': make_suffix('.loc', ':label', axis)},

            {'indexer': 'arr_pos',
             'suffix': make_suffix('.iloc', 'arr_pos', axis)},
            {'indexer': 'arr_label',
             'suffix': make_suffix('.loc', 'arr_label', axis)},

            {'indexer': 'iloc_mask',
             'suffix': make_suffix('.iloc', 'mask', axis)},
            {'indexer': 'loc_mask',
             'suffix': make_suffix('.loc', 'mask', axis)}, ]:

            b = Benchmark('obj%s' % params['suffix'],
                          setup_template % {
                              'class_name': klass.__name__,
                              'ctor_args': ctor_args, 'axis': axis or 0},
                          name=get_benchmark_name(params['indexer'], axis))
            result[b.name] = b

    return result

# Register benchmarks for each (container class, index type, shape) combo by
# publishing the generated Benchmark instances at module level, where the
# suite discovers them by scanning module globals.
for _klass, _idx_type, _shape in [
        (tm.Series, 's', 100000),
        (tm.DataFrame, 's', (10, 100000)),
        (tm.DataFrame, 's', (100000, 10)),
        (tm.Panel, 's', (100000, 10, 10)),
        (tm.Panel, 's', (10, 10, 100000))]:
    globals().update(generate_index_benchmarks(_klass, _idx_type, _shape))
41 changes: 33 additions & 8 deletions vb_suite/suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,17 +30,42 @@
'timedelta',
'eval']

def discover_benchmarks(mods, return_as='list'):
    """
    Collect available benchmarks from specified modules.

    Arguments
    ---------
    mods: list of str
        List of modules to search in
    return_as: {'both', 'list', 'dict'}
        Specifies result type: 'dict' groups benchmarks by module,
        'list' returns them flat, 'both' returns (dict, list).

    Raises
    ------
    ValueError
        If ``return_as`` is not one of the accepted values.
    """
    by_module = {}
    benchmarks = []

    for modname in mods:
        # Import the benchmark module and pick up every module-level
        # Benchmark instance it defines.
        ref = __import__(modname)
        mod_benchmarks = [v for v in ref.__dict__.values()
                          if isinstance(v, Benchmark)]

        # Every benchmark must be named: the name is the key used for
        # regex filtering and reporting downstream.
        for bm in mod_benchmarks:
            assert bm.name is not None

        by_module[modname] = mod_benchmarks
        benchmarks.extend(mod_benchmarks)

    if return_as == 'both':
        return by_module, benchmarks
    elif return_as == 'list':
        return benchmarks
    elif return_as == 'dict':
        return by_module
    else:
        raise ValueError("Incorrect return_as value: %s" % return_as)

by_module, benchmarks = discover_benchmarks(modules, return_as='both')

import getpass
import sys
Expand Down
16 changes: 12 additions & 4 deletions vb_suite/test_perf.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,9 @@ def __call__(self, parser, namespace, values, option_string=None):
dest='regex',
default="",
help='Regex pat, only tests whose name matches the regext will be run.')
parser.add_argument('-e', '--extra-benchmarks', metavar='EXTRA',
dest='extras', action='append',
help='Extra modules to collect benchmarks from')
parser.add_argument('-s', '--seed',
metavar="SEED",
dest='seed',
Expand Down Expand Up @@ -442,6 +445,7 @@ def print_report(df,h_head=None,h_msg="",h_baseline=None,b_msg=""):
if args.stats :
try:
pd.options.display.expand_frame_repr=False
pd.set_option('display.max_rows', None)
except:
pass
stats_footer += str(df.T.describe().T) + "\n\n"
Expand All @@ -461,10 +465,7 @@ def print_report(df,h_head=None,h_msg="",h_baseline=None,b_msg=""):
args.log_file)



def main():
from suite import benchmarks

if not args.log_file:
args.log_file = os.path.abspath(
os.path.join(REPO_PATH, 'vb_suite.log'))
Expand Down Expand Up @@ -509,7 +510,14 @@ def main():
# surprises
os.chdir(os.path.dirname(os.path.abspath(__file__)))

benchmarks = [x for x in benchmarks if re.search(args.regex,x.name)]
from suite import discover_benchmarks, benchmarks

benchmarks = [b for b in benchmarks]
if args.extras:
benchmarks.extend(discover_benchmarks(args.extras, return_as='list'))

benchmarks = [bm for bm in benchmarks
if re.search(args.regex, bm.name)]

for b in benchmarks:
b.repeat = args.repeats
Expand Down