赞
踩
python bisect 工具以及应用
主要用于在顺序固定的序列中查找以及插入
内置了四种方法
bisect_left/right 方法找到应该插入元素的位置,对于和序列中元素不相同的值,两个方法返回的一样,对于相同的值,left返回相同值的位置,right返回相同值下一个位置
例如在ps = [1, 3, 5, 9, 9, 100]
query = 9
的话,left返回3,right返回5
import bisect import pandas as pd import numpy as np from functools import reduce from typing import List import tensorflow as tf import torch from sklearn.preprocessing import KBinsDiscretizer # Bisection algorithms. 源码阅读 def insort_right(a, x, lo=0, hi=None, *, key=None): """Insert item x in list a, and keep it sorted assuming a is sorted. If x is already in a, insert it to the right of the rightmost x. Optional args lo (default 0) and hi (default len(a)) bound the slice of a to be searched. A custom key function can be supplied to customize the sort order. """ if key is None: lo = bisect_right(a, x, lo, hi) else: lo = bisect_right(a, key(x), lo, hi, key=key) a.insert(lo, x) def insort_left(a, x, lo=0, hi=None, *, key=None): """Insert item x in list a, and keep it sorted assuming a is sorted. If x is already in a, insert it to the left of the leftmost x. Optional args lo (default 0) and hi (default len(a)) bound the slice of a to be searched. A custom key function can be supplied to customize the sort order. """ if key is None: lo = bisect_left(a, x, lo, hi) else: lo = bisect_left(a, key(x), lo, hi, key=key) a.insert(lo, x) def bisect_right(a, x, lo=0, hi=None, *, key=None): """Return the index where to insert item x in list a, assuming a is sorted. The return value i is such that all e in a[:i] have e <= x, and all e in a[i:] have e > x. So if x already appears in the list, a.insert(i, x) will insert just after the rightmost x already there. Optional args lo (default 0) and hi (default len(a)) bound the slice of a to be searched. A custom key function can be supplied to customize the sort order. """ if lo < 0: raise ValueError('lo must be non-negative') if hi is None: hi = len(a) # Note, the comparison uses "<" to match the # __lt__() logic in list.sort() and in heapq. if key is None: while lo < hi: mid = (lo + hi) // 2 if x < a[mid]: hi = mid else: lo = mid + 1 else: while lo < hi: mid = (lo + hi) // 2 if x < key(a[mid]): hi = mid else: lo = mid + 1 return lo def bisect_left(a, x, lo=0, hi=None, *, key=None): """Return the index where to insert item x in list a, assuming a is sorted。 The return value i is such that all e in a[:i] have e < x, and all e in a[i:] have e >= x. So if x already appears in the list, a.insert(i, x) will insert just before the leftmost x already there. Optional args lo (default 0) and hi (default len(a)) bound the slice of a to be searched. A custom key function can be supplied to customize the sort order. """ if lo < 0: raise ValueError('lo must be non-negative') if hi is None: hi = len(a) # Note, the comparison uses "<" to match the # __lt__() logic in list.sort() and in heapq. if key is None: while lo < hi: mid = (lo + hi) // 2 if a[mid] < x: lo = mid + 1 else: hi = mid else: while lo < hi: mid = (lo + hi) // 2 if key(a[mid]) < x: lo = mid + 1 else: hi = mid return lo # Application # 二分查找函数 ps = [1, 3, 5, 9, 100] T = 66 print(bisect.bisect_left(ps, T, lo=0, hi=len(ps))) # 二分左边界 print(bisect.bisect_right(ps, T, lo=0, hi=len(ps))) # 二分右边界 print(bisect.bisect_left(ps, 9, lo=0, hi=len(ps))) # 二分左边界 print(bisect.bisect_right(ps, 9, lo=0, hi=len(ps))) # 二分右边界 bisect.insort_left(ps, T, lo=0, hi=len(ps)) # 二分插入到左侧 bisect.insort_right(ps, T, lo=0, hi=len(ps)) # 二分插入到右侧 # 当时自我实现的,找到右边第一个比target的大的值的位置 def find_next_greater_o(arr, target): """相当于bisect_right""" low, high = 0, len(arr) - 1 try: if target < arr[low]: return -1 except Exception as e: return -1 while low <= high: mid = low + (high - low) // 2 if arr[mid] <= target: low = mid + 1 else: if arr[mid - 1] <= target: return mid high = mid - 1 return -1 # 利用bisect实现 def find_next_greater(arr, target): try: i = bisect_right(arr, target, lo=0, hi=len(arr)) if i == len(arr): return -1 return i except Exception as e: print(f"Error [{e}] happened when finding the [{target}], return default") return -1 def find_gt(a, x): # Find leftmost value greater than i = bisect_right(a, x) if i != len(a): return i return -1 arr = [3, 4, 6, 7] target = 5 # print(find_next_greater(arr, target)) # print(find_gt(arr, target)) # print(find_next_greater(arr, 'fd')) # print(find_next_greater(arr, 999)) # print(find_gt(arr, 999)) # def grade(score, breakpoints=[60, 70, 80, 90], grades='FDCBA'): # i = bisect(breakpoints, score) # return grades[i] # # a = [grade(score) for score in [33, 99, 77, 70, 89, 90, 100]] # ['F', 'A', 'C', 'C', 'B', 'A', 'A']
应用:数据分箱
# 数据分箱 def p_cut(bins: List[int], val: int, grades=None) -> int: """ return the result of discretization input val;左闭右开; :param bins: bins for describe :param val: value to set :param grades: customize the output """ if grades is not None: assert len(bins) == len(grades) grades = grades else: grades = reduce(lambda x, y: str(x) + str(y), [i for i in range(len(bins) + 1)]) # try if for cover typeerror try: i = bisect_right(bins, val) return int(grades[i]) except TypeError as e: print(f"TypeError: {e} for input {val}") return -1 print("*" * 10 + "Discretization test" + "*" * 10) print(p_cut([1, 10, 20, 50, 100], val=5)) print(p_cut([1, 10, 20, 50, 100], val=1)) print(p_cut([1, 10, 20, 50, 100], val=10)) print(p_cut([1, 10, 20, 50, 100], val=500)) print(p_cut([1, 10, 20, 50, 100], val="")) bins = [1, 10, 20, 50, 100] test_arr = [-100, None, 1, 5, 30, 500] test_arr1 = [-100, 1, 5, 30, 500] print([p_cut(bins, v) for v in test_arr]) # 同样的功能利用pandas cut实现 # pandas cut 不支持val是其他类型,会直接报typeerror; # pandas cut 对于不在区间内的值做NaN处理 print(pd.cut(test_arr1, bins=bins, labels=['a', 'b', 'c', 'd'])) # 在大数据情况下,pandas cut可能不够高效,同样的功能还可以使用sklearn实现 data = np.array(test_arr1).reshape(-1, 1) # 重塑数据为2D,因为 Scikit-Learn 需要2D数组 est = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='quantile') est.fit(data) bins_sk = est.bin_edges_[0] # 获取边界 print(bins_sk) # tensorflow实现 ## tf.keras.layers.Discretization ## tf.keras.layers.Discretization 左闭右开 """在tf1 中是使用 tf.feature_column.bucketized_column 但是在tf2 中官方推荐的是tf.keras.layers.Discretization """ discretization_layer = tf.keras.layers.Discretization(bin_boundaries=bins) print(f"tf.keras output:{discretization_layer(test_arr1)}") ## tf.searchsorted or torch.searchsorted(bins, values, right=True) 左开右闭 print(tf.reshape(tf.searchsorted(bins, test_arr1), (-1, len(test_arr1)))) bins_tensor = torch.tensor(bins, dtype=torch.float32) values = torch.tensor(test_arr1, dtype=torch.float32) indices = torch.searchsorted(bins_tensor, values, right=False) # 对比 print("*" * 10) print(f"self code output:{[p_cut(bins, v) for v in test_arr1]}") print(f"pd.cut output: {pd.cut(test_arr1, bins=bins)}") print(f"tf.keras output:{discretization_layer(test_arr1)}") print(f"torch.searchsorted output:{indices}") print(f"tf.searchsorted output :{tf.searchsorted(bins, test_arr1)}") print("*" * 10 + "Discretization test finished" + "*" * 10)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。