当前位置:   article > 正文

python bisect 二分算法工具以及应用

python bisect 二分算法工具以及应用

python bisect 工具以及应用
主要用于在顺序固定的序列中查找以及插入

内置了四种方法

  • bisect_left
  • bisect_right
  • insort_right
  • insort_left

bisect_left/right 方法找到应该插入元素的位置,对于和序列中元素不相同的值,两个方法返回的一样,对于相同的值,left返回相同值的位置,right返回相同值下一个位置
例如在ps = [1, 3, 5, 9, 9, 100]
query = 9
的话,left返回3,right返回5

import bisect
import pandas as pd
import numpy as np
from functools import reduce
from typing import List

import tensorflow as tf
import torch
from sklearn.preprocessing import KBinsDiscretizer


# Bisection algorithms. 源码阅读
def insort_right(a, x, lo=0, hi=None, *, key=None):
    """Insert item x in list a, and keep it sorted assuming a is sorted.
    If x is already in a, insert it to the right of the rightmost x.
    Optional args lo (default 0) and hi (default len(a)) bound the
    slice of a to be searched.
    A custom key function can be supplied to customize the sort order.
    """
    if key is None:
        lo = bisect_right(a, x, lo, hi)
    else:
        lo = bisect_right(a, key(x), lo, hi, key=key)
    a.insert(lo, x)


def insort_left(a, x, lo=0, hi=None, *, key=None):
    """Insert item x in list a, and keep it sorted assuming a is sorted.
    If x is already in a, insert it to the left of the leftmost x.
    Optional args lo (default 0) and hi (default len(a)) bound the
    slice of a to be searched.
    A custom key function can be supplied to customize the sort order.
    """
    if key is None:
        lo = bisect_left(a, x, lo, hi)
    else:
        lo = bisect_left(a, key(x), lo, hi, key=key)
    a.insert(lo, x)


def bisect_right(a, x, lo=0, hi=None, *, key=None):
    """Return the index where to insert item x in list a, assuming a is sorted.
    The return value i is such that all e in a[:i] have e <= x, and all e in
    a[i:] have e > x.  So if x already appears in the list, a.insert(i, x) will
    insert just after the rightmost x already there.
    Optional args lo (default 0) and hi (default len(a)) bound the
    slice of a to be searched.
    A custom key function can be supplied to customize the sort order.
    """
    if lo < 0:
        raise ValueError('lo must be non-negative')
    if hi is None:
        hi = len(a)
    # Note, the comparison uses "<" to match the
    # __lt__() logic in list.sort() and in heapq.
    if key is None:
        while lo < hi:
            mid = (lo + hi) // 2
            if x < a[mid]:
                hi = mid
            else:
                lo = mid + 1
    else:
        while lo < hi:
            mid = (lo + hi) // 2
            if x < key(a[mid]):
                hi = mid
            else:
                lo = mid + 1
    return lo


def bisect_left(a, x, lo=0, hi=None, *, key=None):
    """Return the index where to insert item x in list a, assuming a is sorted。
    The return value i is such that all e in a[:i] have e < x, and all e in
    a[i:] have e >= x.  So if x already appears in the list, a.insert(i, x) will
    insert just before the leftmost x already there.
    Optional args lo (default 0) and hi (default len(a)) bound the
    slice of a to be searched.
    A custom key function can be supplied to customize the sort order.
    """
    if lo < 0:
        raise ValueError('lo must be non-negative')
    if hi is None:
        hi = len(a)
    # Note, the comparison uses "<" to match the
    # __lt__() logic in list.sort() and in heapq.
    if key is None:
        while lo < hi:
            mid = (lo + hi) // 2
            if a[mid] < x:
                lo = mid + 1
            else:
                hi = mid
    else:
        while lo < hi:
            mid = (lo + hi) // 2
            if key(a[mid]) < x:
                lo = mid + 1
            else:
                hi = mid
    return lo


# Application
# 二分查找函数
ps = [1, 3, 5, 9, 100]
T = 66
print(bisect.bisect_left(ps, T, lo=0, hi=len(ps)))  # 二分左边界
print(bisect.bisect_right(ps, T, lo=0, hi=len(ps)))  # 二分右边界
print(bisect.bisect_left(ps, 9, lo=0, hi=len(ps)))  # 二分左边界
print(bisect.bisect_right(ps, 9, lo=0, hi=len(ps)))  # 二分右边界
bisect.insort_left(ps, T, lo=0, hi=len(ps))  # 二分插入到左侧
bisect.insort_right(ps, T, lo=0, hi=len(ps))  # 二分插入到右侧


# 当时自我实现的,找到右边第一个比target的大的值的位置
def find_next_greater_o(arr, target):
    """相当于bisect_right"""
    low, high = 0, len(arr) - 1
    try:
        if target < arr[low]:
            return -1
    except Exception as e:
        return -1

    while low <= high:
        mid = low + (high - low) // 2
        if arr[mid] <= target:
            low = mid + 1
        else:
            if arr[mid - 1] <= target:
                return mid
            high = mid - 1
    return -1


# 利用bisect实现
def find_next_greater(arr, target):
    try:
        i = bisect_right(arr, target, lo=0, hi=len(arr))
        if i == len(arr):
            return -1
        return i
    except Exception as e:
        print(f"Error [{e}] happened when finding the [{target}], return default")
        return -1


def find_gt(a, x):
    # Find leftmost value greater than
    i = bisect_right(a, x)
    if i != len(a):
        return i
    return -1


arr = [3, 4, 6, 7]
target = 5


# print(find_next_greater(arr, target))
# print(find_gt(arr, target))
# print(find_next_greater(arr, 'fd'))
# print(find_next_greater(arr, 999))
# print(find_gt(arr, 999))

# def grade(score, breakpoints=[60, 70, 80, 90], grades='FDCBA'):
#     i = bisect(breakpoints, score)
#     return grades[i]
#
# a = [grade(score) for score in [33, 99, 77, 70, 89, 90, 100]]
# ['F', 'A', 'C', 'C', 'B', 'A', 'A']
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
  • 74
  • 75
  • 76
  • 77
  • 78
  • 79
  • 80
  • 81
  • 82
  • 83
  • 84
  • 85
  • 86
  • 87
  • 88
  • 89
  • 90
  • 91
  • 92
  • 93
  • 94
  • 95
  • 96
  • 97
  • 98
  • 99
  • 100
  • 101
  • 102
  • 103
  • 104
  • 105
  • 106
  • 107
  • 108
  • 109
  • 110
  • 111
  • 112
  • 113
  • 114
  • 115
  • 116
  • 117
  • 118
  • 119
  • 120
  • 121
  • 122
  • 123
  • 124
  • 125
  • 126
  • 127
  • 128
  • 129
  • 130
  • 131
  • 132
  • 133
  • 134
  • 135
  • 136
  • 137
  • 138
  • 139
  • 140
  • 141
  • 142
  • 143
  • 144
  • 145
  • 146
  • 147
  • 148
  • 149
  • 150
  • 151
  • 152
  • 153
  • 154
  • 155
  • 156
  • 157
  • 158
  • 159
  • 160
  • 161
  • 162
  • 163
  • 164
  • 165
  • 166
  • 167
  • 168
  • 169
  • 170
  • 171
  • 172
  • 173

应用:数据分箱

# 数据分箱
def p_cut(bins: List[int], val: int, grades=None) -> int:
    """
    return the result of discretization input val;左闭右开;
    :param bins: bins for describe
    :param val: value to set
    :param grades: customize the output
    """
    if grades is not None:
        assert len(bins) == len(grades)
        grades = grades
    else:
        grades = reduce(lambda x, y: str(x) + str(y),
                        [i for i in range(len(bins) + 1)])

    # try if for cover typeerror
    try:
        i = bisect_right(bins, val)
        return int(grades[i])
    except TypeError as e:
        print(f"TypeError: {e} for input {val}")
        return -1

print("*" * 10 + "Discretization test" + "*" * 10)
print(p_cut([1, 10, 20, 50, 100], val=5))
print(p_cut([1, 10, 20, 50, 100], val=1))
print(p_cut([1, 10, 20, 50, 100], val=10))
print(p_cut([1, 10, 20, 50, 100], val=500))
print(p_cut([1, 10, 20, 50, 100], val=""))

bins = [1, 10, 20, 50, 100]
test_arr = [-100, None, 1, 5, 30, 500]
test_arr1 = [-100, 1, 5, 30, 500]
print([p_cut(bins, v) for v in test_arr])

# 同样的功能利用pandas cut实现
# pandas cut 不支持val是其他类型,会直接报typeerror;
# pandas cut 对于不在区间内的值做NaN处理
print(pd.cut(test_arr1, bins=bins, labels=['a', 'b', 'c', 'd']))

# 在大数据情况下,pandas cut可能不够高效,同样的功能还可以使用sklearn实现
data = np.array(test_arr1).reshape(-1, 1)  # 重塑数据为2D,因为 Scikit-Learn 需要2D数组
est = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='quantile')
est.fit(data)
bins_sk = est.bin_edges_[0]  # 获取边界
print(bins_sk)

# tensorflow实现
## tf.keras.layers.Discretization
## tf.keras.layers.Discretization 左闭右开
"""在tf1 中是使用 tf.feature_column.bucketized_column
但是在tf2 中官方推荐的是tf.keras.layers.Discretization
"""
discretization_layer = tf.keras.layers.Discretization(bin_boundaries=bins)
print(f"tf.keras output:{discretization_layer(test_arr1)}")

## tf.searchsorted or torch.searchsorted(bins, values, right=True) 左开右闭
print(tf.reshape(tf.searchsorted(bins, test_arr1), (-1, len(test_arr1))))

bins_tensor = torch.tensor(bins, dtype=torch.float32)
values = torch.tensor(test_arr1, dtype=torch.float32)
indices = torch.searchsorted(bins_tensor, values, right=False)

# 对比
print("*" * 10)
print(f"self code output:{[p_cut(bins, v) for v in test_arr1]}")
print(f"pd.cut output: {pd.cut(test_arr1, bins=bins)}")
print(f"tf.keras output:{discretization_layer(test_arr1)}")
print(f"torch.searchsorted output:{indices}")
print(f"tf.searchsorted output :{tf.searchsorted(bins, test_arr1)}")
print("*" * 10 + "Discretization test finished" + "*" * 10) 
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/盐析白兔/article/detail/556720
推荐阅读
相关标签
  

闽ICP备14008679号