Source code for sortedfile

#
# Copyright 2012, David Wilson
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#    http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
Efficient seeking within sorted text files. Works by implementing in-place
binary search on the lines of the file, with a small hack to handle the first
line of the file.

See accompanying documentation for more information.
http://sortedfile.readthedocs.org/
"""

# If user specifies the exact start of a line using the `lo` parameter, due to
# logic below we always skip the first read substring after a seek. Therefore
# we subtract one from `lo` if it is provided to the bisect() functions to
# ensure the user's full intended line is seen.

import functools
import itertools
import os

try:
    from mmap import mmap as _mmap
except ImportError:
    _mmap = None


[docs]def getsize(fp): """Return the size of `fp` if it is a physical file, ``StringIO``, or ``mmap.mmap``, otherwise raise ValueError.""" if hasattr(fp, 'getvalue'): return len(fp.getvalue()) elif _mmap and isinstance(fp, _mmap): return len(fp) elif hasattr(fp, 'name') and os.path.exists(fp.name): return os.path.getsize(fp.name) else: raise ValueError("can't get size of %r" % (fp,))
[docs]def warm(fp, lo=None, hi=None): """Encourage the seekable file `fp` to become cached by reading from it sequentially.""" lo = lo or 0 hi = hi or getsize(fp) fp.seek(lo) s = ' ' while s and hi >= 0: s = fp.read(10485760) hi -= len(s) if s else hi
[docs]def bisect_seek_left(fp, x, lo=None, hi=None, key=None): """Position the sorted seekable file `fp` such that all preceding lines are less than `x`. If `x` is present, the file is positioned on its first occurrence.""" lo = (lo - 1) if lo else 0 hi = hi or getsize(fp) key = key or (lambda s: s) while lo < hi: mid = (lo + hi) // 2 fp.seek(mid) if mid: fp.readline() s = fp.readline() if s and key(s) < x: lo = mid + 1 else: hi = mid fp.seek(lo) if lo: fp.readline()
[docs]def bisect_seek_right(fp, x, lo=None, hi=None, key=None): """Position the sorted seekable file `fp` such that all subsequent lines are greater than `x`. If `x` is present, the file is positioned past its last occurrence.""" lo = (lo - 1) if lo else 0 hi = hi or getsize(fp) key = key or (lambda s: s) while lo < hi: mid = (lo + hi) // 2 fp.seek(mid) if mid: fp.readline() s = fp.readline() if s and x < key(s): hi = mid else: lo = mid + 1 fp.seek(lo) if lo: fp.readline()
[docs]def bisect_seek_fixed_left(fp, n, x, lo=None, hi=None, key=None): """Position the sorted seekable file `fp` such that all preceding `n` byte records are less than `x`. If `x` is present, the file is positioned on its first occurrence.""" lo = lo or 0 key = key or (lambda s: s) rlo = lo / n rhi = (hi or getsize(fp)) / n while rlo < rhi: mid = (rlo + rhi) // 2 fp.seek(lo + (mid * n)) s = fp.read(n) if s and key(s) < x: rlo = mid + 1 else: rhi = mid fp.seek(lo + (rlo * n))
[docs]def bisect_seek_fixed_right(fp, n, x, lo=None, hi=None, key=None): """Position the sorted seekable file `fp` such that all subsequent `n` byte records are greater than `x`. If `x` is present, the file is positioned past its last occurrence.""" lo = lo or 0 key = key or (lambda s: s) rlo = lo / n rhi = (hi or getsize(fp)) / n while rlo < rhi: mid = (rlo + rhi) // 2 fp.seek(lo + (mid * n)) s = fp.read(n) if s and x < key(s): rhi = mid else: rlo = mid + 1 fp.seek(lo + (rlo * n))
[docs]def bisect_func_left(x, lo, hi, func): """Bisect `func(i)`, returning an index such that preceding values are less than `x`. If `x` is present, the returned index is its first occurrence. EOF is assumed if `func` returns None.""" while lo < hi: mid = (lo + hi) // 2 k = func(mid) if k is not None and k < x: lo = mid + 1 else: hi = mid return lo, k
[docs]def bisect_func_right(x, lo, hi, func): """Bisect `func(i)`, returning an index such that consecutive values are greater than `x`. If `x` is present, the returned index is past its last occurrence. EOF is assumed if `func` returns None.""" while lo < hi: mid = (lo + hi) // 2 k = func(mid) if k is not None and x < k: hi = mid else: lo = mid + 1 return lo, k
[docs]def extents(fp, lo=None, hi=None): """Return a tuple of the first and last lines from the seekable file `fp`.""" lo = (lo - 1) if lo else 0 hi = hi or getsize(fp) bisect_seek_left(fp, '', lo, hi) low = fp.readline() for offset in xrange(0, 1048576, 4096): fp.seek(hi - offset) _, sep, high = fp.read(offset - 1).rstrip('\n').rpartition('\n') if sep: return low, high
[docs]def extents_fixed(fp, n, lo=None, hi=None): """Return a tuple of the first and last `n` byte records from the seekable file `fp`.""" lo = lo or 0 hi = hi or getsize(fp) bisect_seek_fixed_left(fp, n, '', lo, hi) low = fp.read(n) recs = (hi - lo) // n fp.seek(lo + (n * (recs - 1))) return low, fp.read(n)
[docs]def iter_inclusive(fp, x, y, lo=None, hi=None, key=None): """Iterate lines of the sorted seekable file `fp` satisfying `x <= line <= y`.""" key = key or (lambda s: s) bisect_seek_left(fp, x, lo, hi, key) pred = lambda s: x <= key(s) <= y return itertools.takewhile(pred, iter(fp.readline, ''))
[docs]def iter_exclusive(fp, x, y, lo=None, hi=None, key=None): """Iterate lines of the sorted seekable file `fp` satisfying `x < line < y`.""" key = key or (lambda s: s) bisect_seek_right(fp, x, lo, hi, key) pred = lambda s: x < key(s) < y return itertools.takewhile(pred, iter(fp.readline, ''))
[docs]def iter_fixed_inclusive(fp, n, x, y, lo=None, hi=None, key=None): """Iterate `n` byte records of the sorted seekable file `fp` satisfying `x <= record <= y`.""" key = key or (lambda s: s) bisect_seek_fixed_left(fp, n, x, lo, hi, key) pred = lambda s: x <= key(s) <= y return itertools.takewhile(pred, iter(functools.partial(fp.read, n), ''))
[docs]def iter_fixed_exclusive(fp, n, x, y, lo=None, hi=None, key=None): """Iterate `n` byte records of the sorted seekable file `fp` satisfying `x < record < y`.""" key = key or (lambda s: s) bisect_seek_fixed_right(fp, n, x, lo, hi, key) pred = lambda s: x < key(s) < y return itertools.takewhile(pred, iter(functools.partial(fp.read, n), ''))