- # 71、pandas.tseries.api.guess_datetime_format函数
- pandas.tseries.api.guess_datetime_format(dt_str, dayfirst=False)
- Guess the datetime format of a given datetime string.
- Parameters:
- dt_str : str
- Datetime string to guess the format of.
- dayfirst : bool, default False
- If True, parses dates with the day first, e.g. 20/01/2005
- Warning
- dayfirst=True is not strict, but will prefer to parse with day first (this is a known bug).
- Returns:
- ret : str or None
- datetime format string (for strftime or strptime), or None if it can’t be guessed.

- import pandas as pd
- dt_str = '2024-07-17'
- format_guess = pd.tseries.api.guess_datetime_format(dt_str)
- print(format_guess)
- # %Y-%m-%d
- # 72、pandas.util.hash_array函数
- pandas.util.hash_array(vals, encoding='utf8', hash_key='0123456789123456', categorize=True)
- Given a 1d array, return an array of deterministic integers.
- Parameters:
- vals
- ndarray or ExtensionArray
- encoding
- str, default ‘utf8’
- Encoding for data & key when strings.
- hash_key
- str, default _default_hash_key
- Hash_key for string key to encode.
- categorize
- bool, default True
- Whether to first categorize object arrays before hashing. This is more efficient when the array contains duplicate values.
- Returns:
- ndarray[np.uint64, ndim=1]
- Hashed values, same length as the vals.

72-2-4、categorize(可选,默认值为True):当此参数为True时,函数会在哈希之前先对object类型的数组进行分类(即,将相同的元素映射到相同的整数标签上)。该参数影响的是计算效率:当数组中包含重复值时,这样做更高效;官方文档并未说明它能降低哈希碰撞的概率。对于已经是分类类型(Categorical dtype)的列,本文73-4的示例中categorize=True与categorize=False的输出完全相同。
给定一个一维数组(如NumPy数组或Pandas Series),返回该数组中每个元素的哈希值所组成的数组。
- import pandas as pd
- # 创建一个包含字符串的Series
- s = pd.Series(['Jimmy', 'Bryce', 'Myelsa'])
- # 计算每个字符串的哈希值
- hashed_values = pd.util.hash_array(s.values)
- # 输出哈希值数组
- print(hashed_values)
- # [1382347394209841164 9798869407607568009 6426393181695770081]
- # 73、pandas.util.hash_pandas_object函数
- pandas.util.hash_pandas_object(obj, index=True, encoding='utf8', hash_key='0123456789123456', categorize=True)
- Return a data hash of the Index/Series/DataFrame.
- Parameters:
- obj
- Index, Series, or DataFrame
- index
- bool, default True
- Include the index in the hash (if Series/DataFrame).
- encoding
- str, default ‘utf8’
- Encoding for data & key when strings.
- hash_key
- str, default _default_hash_key
- Hash_key for string key to encode.
- categorize
- bool, default True
- Whether to first categorize object arrays before hashing. This is more efficient when the array contains duplicate values.
- Returns:
- Series of uint64, same length as the object.

- # 73-1、对DataFrame进行哈希处理
- import pandas as pd
- # 创建一个简单的DataFrame
- df = pd.DataFrame({
- 'A': [1, 2, 3],
- 'B': ['a', 'b', 'c'],
- 'C': [True, False, True]
- })
- # 对DataFrame进行哈希处理,包括索引
- hashes = pd.util.hash_pandas_object(df, index=True)
- print("Hashes with index included:", hashes)
- # 对DataFrame进行哈希处理,不包括索引
- hashes_no_index = pd.util.hash_pandas_object(df, index=False)
- print("Hashes without index:", hashes_no_index, end='\n\n')
- # 73-2、对Series进行哈希处理
- import pandas as pd
- # 创建一个 Series
- s = pd.Series([1, 2, 3, 4, 5])
- # 对 Series 进行哈希处理
- series_hashes = pd.util.hash_pandas_object(s)
- print("Hashes for Series:", series_hashes)
- # 注意:Series始终带有索引(未指定时为默认的整数索引),上面的调用使用默认的index=True,索引已参与哈希;下面通过reset_index()把索引变成普通列,得到DataFrame后再进行哈希
- s_df = s.reset_index(drop=False)
- s_df_hashes_with_index = pd.util.hash_pandas_object(s_df)
- print("Hashes for Series as DataFrame with index:", s_df_hashes_with_index, end='\n\n')
- # 73-3、处理包含浮点数的DataFrame
- import pandas as pd
- # 创建一个包含浮点数的DataFrame
- df_floats = pd.DataFrame({
- 'A': [1.0, 2.0, 3.0000000000001], # 注意最后一个数有微小的精度差异
- 'B': [4.0, 5.0, 6.0]
- })
- # 对DataFrame进行哈希处理
- floats_hashes = pd.util.hash_pandas_object(df_floats)
- print("Hashes for DataFrame with floats:", floats_hashes, end='\n\n')
- # 注意:由于浮点数的精度问题,即使 '3.0' 和 '3.0000000000001' 在数值上接近,它们的哈希值也可能不同
- # 73-4、对包含分类数据的DataFrame进行哈希处理
- import pandas as pd
- # 创建一个包含分类数据的DataFrame
- df_categorical = pd.DataFrame({
- 'A': pd.Categorical(['a', 'b', 'a', 'c']),
- 'B': [1, 2, 3, 4]
- })
- # 对包含分类列的DataFrame进行哈希处理(categorize=True为默认值)
- categorical_hashes = pd.util.hash_pandas_object(df_categorical, categorize=True)
- print("Hashes with categorical data (categorize=True):", categorical_hashes)
- # 使用categorize=False再次进行哈希处理;categorize针对的是object类型的数组(见参数说明),对于已经是Categorical类型的列'A',本例中两次输出完全相同
- categorical_hashes_no_categorize = pd.util.hash_pandas_object(df_categorical, categorize=False)
- print("Hashes with categorical data (categorize=False):", categorical_hashes_no_categorize, end='\n\n')
- # 73-5、对包含时间戳的DataFrame进行哈希处理
- import pandas as pd
- # 创建一个包含时间戳的DataFrame
- df_timestamps = pd.DataFrame({
- 'Timestamp': pd.to_datetime(['2024-07-01', '2024-07-05', '2024-07-17'])
- })
- # 对DataFrame进行哈希处理
- timestamp_hashes = pd.util.hash_pandas_object(df_timestamps)
- print("Hashes for DataFrame with timestamps:", timestamp_hashes, end='\n\n')
- # 73-6、处理具有NaN值的DataFrame
- import pandas as pd
- # 创建一个包含NaN值的DataFrame
- df_nan = pd.DataFrame({
- 'A': [1, 2, None, 4],
- 'B': ['a', 'b', 'c', None]
- })
- # 对DataFrame进行哈希处理
- nan_hashes = pd.util.hash_pandas_object(df_nan)
- print("Hashes for DataFrame with NaN values:", nan_hashes, end='\n\n')
- # 注意:缺失值(None/NaN)同样会参与哈希,包含缺失值的行仍会得到一个uint64哈希值(见下方输出);具体的哈希方式取决于Pandas的内部实现
- # 73-7、对大型DataFrame进行哈希处理
- import pandas as pd
- import numpy as np
- # 创建一个大型DataFrame(这里仅作为示例,实际中可能更大)
- np.random.seed(0)
- large_df = pd.DataFrame({
- 'A': np.random.randint(0, 100, size=100000),
- 'B': np.random.choice(['a', 'b', 'c', 'd'], size=100000),
- 'C': np.random.rand(100000)
- })
- # 对大型DataFrame进行哈希处理(注意:这可能需要一些时间)
- large_hashes = pd.util.hash_pandas_object(large_df, index=False)
- print("Hashes for a large DataFrame (first 10):", large_hashes[:10])

- # 73-1、对DataFrame进行哈希处理
- # Hashes with index included: 0 9483444313420146699
- # 1 5719781360446296993
- # 2 9877900052590456950
- # dtype: uint64
- # Hashes without index: 0 485995293390257589
- # 1 3005746743269222528
- # 2 15997040775864825588
- # dtype: uint64
- # 73-2、对Series进行哈希处理
- # Hashes for Series: 0 14639053686158035780
- # 1 3869563279212530728
- # 2 393322362522515241
- # 3 4080319230603510727
- # 4 13014484659661894915
- # dtype: uint64
- # Hashes for Series as DataFrame with index: 0 5967740633143088628
- # 1 9280677857880118003
- # 2 6253357580284104503
- # 3 4295247446495588871
- # 4 12355848007932281175
- # dtype: uint64
- # 73-3、处理包含浮点数的DataFrame
- # Hashes for DataFrame with floats: 0 12179765616421863049
- # 1 4850516111580897109
- # 2 13664998175358214438
- # dtype: uint64
- # 73-4、对包含分类数据的DataFrame进行哈希处理
- # Hashes with categorical data (categorize=True): 0 10448339489922407492
- # 1 328955597323365005
- # 2 17337560684877153397
- # 3 7837014030697196839
- # dtype: uint64
- # Hashes with categorical data (categorize=False): 0 10448339489922407492
- # 1 328955597323365005
- # 2 17337560684877153397
- # 3 7837014030697196839
- # dtype: uint64
- # 73-5、对包含时间戳的DataFrame进行哈希处理
- # Hashes for DataFrame with timestamps: 0 5029448861734248502
- # 1 15824968476515617805
- # 2 16154582340151959443
- # dtype: uint64
- # 73-6、处理具有NaN值的DataFrame
- # Hashes for DataFrame with NaN values: 0 390400230840112733
- # 1 9079939592730820628
- # 2 13397986763273461122
- # 3 18293157628943714066
- # dtype: uint64
- # 73-7、对大型DataFrame进行哈希处理
- # Hashes for a large DataFrame (first 10): 0 3741430234977074334
- # 1 13815252830947086855
- # 2 5536596816482122074
- # 3 7459333729972558407
- # 4 10226587178543578329
- # 5 16985416363548347045
- # 6 5210640794891753453
- # 7 334407979408579242
- # 8 3392106525313158311
- # 9 17317895163462122340
- # dtype: uint64

