赞
踩
用途有两个:
本例场景限制:只有读表权限
通常来说,表里的数据会有行号,但有时也不一定。如果有写表权限的话,那么就自己增加一个行号,便于小批量的读取。
select * from
(select * , (@rownum:= @rownum + 1) as rownum from some_table ) as t
where t.rownum > 100
limit 100;
# 修改表结构
ALTER TABLE t_ds_court ADD row_number int
SET @row = 0
UPDATE some_table SET row_number = (@row: =@row+1)
问题是表太大,所以需要读取时分块执行
全表很大,但是键值通常并不大
import time import pymysql # 获取某一列的数据,加筛选条件 def exe_sql_with_cursor(the_sql, cfg_mysql): connection = pymysql.connect(**cfg_mysql) start = time.time() * 1000 try: with connection.cursor() as cursor: cursor.execute(the_sql) connection.commit() res = cursor.fetchall() except: print('[E] SQL Execution Error ') res = None finally: connection.close() end = time.time() * 1000 print('[I] SQL Done Within %.2f ms' % (end-start)) return res the_sql = 'select id from some_table' key_list = exe_sql_with_cursor(the_sql, cfg_mysql) # 这带宽... [I] SQL Done Within 298283.59 ms In [13]: key_list[:3] Out[13]: ((753117,), (753118,), (753119,))
将列表扁平化后看看是否是等差数列
key_list_ravel = [x[0] for x in key_list] key_list_ravel.sort() import numpy as np # 判断一个一维列表是否为(公差为1的)等差数列 def is_diff1_num(some_list): some_arr = np.array(some_list) some_arr.astype(float) some_arr1 = np.diff(some_arr,1) diff_nums = np.unique(some_arr1) if len(diff_nums) == 1: status = True else: status = False return status, diff_nums[0] is_diff1, diffnum = is_diff1_num(key_list_ravel[:100])
结果果然不是。然后按照百分位取数(以前用pandas也做过,以后打算慢慢迁移到使用numpy上)
# 获取有效分位数(浮点数) ,最大最小使用实际的数(32位范围)而不是np.inf
def get_list_ptiles(some_list, pct_list = None, cuts = None , min_val=-1e300, max_val=1e300):
if pct_list is None:
# linspace 会生成包含两端的列表 [0. , 0.11111111, 0.22222222, 0.33333333, 0.44444444, 1.]
# 注意要转为0~100的值,不是小数
pct_list = np.linspace(0,1, cuts) *100
# 获取实际分位数后去重
cut_nums = np.unique(np.percentile(some_list, pct_list))
# 为首尾增加最大最小
cut_nums[0] = min_val
cut_nums[-1] = max_val
return list(cut_nums)
把之前的函数稍作修改,以便支持按条件读取
import pymysql as pyl import pandas as pd # 按条件读取数据并转为表 def mysql_read_table_where(table_name,where, cfg_mysql=None): conn = pyl.connect(**cfg_mysql) try: with conn.cursor() as cursor: tem_sql = 'select * from %s ' % table_name + ' ' + where cursor.execute(tem_sql) # 增加 col_dis = cursor.description col = [] for x in list(col_dis): col.append(x[0]) # print(col) result = cursor.fetchall() df = pd.DataFrame(list(result)) df.columns = col conn.commit() res = df except: res = False finally: conn.close() return res
最后加上tqdm完成整个工作
import tqdm
# 分批次循环的读取mysql里的表格, 使用tqdm显示。需要一个数值型的变量和一个数值型列表(序号列表)。
def read_mysql_table_by_iter(table_name, idx_list ,cfg_mysql, where_template='where id >= %s and id <%s' ):
res_df_list = []
for idx in tqdm.tqdm(range(1, len(idx_list)-1)):
where_condition = where_template % (idx_list[idx], idx_list[idx +1])
tem_df = mysql_read_table_where(table_name, where_condition, cfg_mysql=cfg_mysql)
res_df_list.append(tem_df)
return pd.concat(res_df_list, ignore_index=True)
key_split_index_list = get_list_ptiles(key_list_ravel, cuts=1000)
res_df = read_mysql_table_by_iter('some_table', key_split_index_list, cfg_mysql)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。