赞
踩
source_database:數據庫
source_table:表
source_columns:列
source_splitPk:split key,要求必須是int類型
# coding=utf-8 import json import getopt import os import sys import MySQLdb #MySQL相关配置,需根据实际情况作出修改 mysql_host = "47.57.227.5" mysql_port = "3306" mysql_user = "vinson_readonly" mysql_passwd = "8AGY5Eqq8Ac8VR7b" #HDFS NameNode相关配置,需根据实际情况作出修改 hdfs_nn_host = "mycluster" hdfs_nn_port = "8020" #生成配置文件的目标路径,可根据实际情况作出修改 def get_connection(): return MySQLdb.connect(host=mysql_host, port=int(mysql_port), user=mysql_user, passwd=mysql_passwd) def get_mysql_meta(database, table, columns): connection = get_connection() cursor = connection.cursor() if columns == 'all': # 如果传入 '*' 表示要所有列 sql = "SELECT COLUMN_NAME, DATA_TYPE FROM information_schema.COLUMNS WHERE TABLE_SCHEMA='%s' AND TABLE_NAME='%s' ORDER BY ORDINAL_POSITION" % (database, table) else: # 传入指定列 # 将每个列名加上单引号 columns = ', '.join("'%s'" % col.strip() for col in columns.split(',')) sql = "SELECT COLUMN_NAME, DATA_TYPE FROM information_schema.COLUMNS WHERE TABLE_SCHEMA='%s' AND TABLE_NAME='%s' AND COLUMN_NAME IN (%s) ORDER BY ORDINAL_POSITION" % ( database, table, columns) cursor.execute(sql) fetchall = cursor.fetchall() # print(fetchall) cursor.close() connection.close() return fetchall def get_mysql_columns(database, table,source_columns): return map(lambda x: x[0], get_mysql_meta(database,table,source_columns)) def get_hive_columns(database, table,source_columns): def type_mapping(mysql_type): mappings = { "bigint": "bigint", "int": "bigint", "smallint": "bigint", "tinyint": "bigint", "mediumint": "bigint", "decimal": "string", "double": "double", "float": "float", "binary": "string", "char": "string", "varchar": "string", "datetime": "string", "time": "string", "timestamp": "string", "date": "string", "text": "string", "bit": "string", } return mappings[mysql_type] meta = get_mysql_meta(database, table,source_columns) return map(lambda x: {"name": x[0], "type": type_mapping(x[1].lower())}, meta) def generate_json(source_database, source_table,source_columns,source_splitPk): job = { "job": { "setting": { "speed": { "channel": 15 }, "errorLimit": { "record": 0, "percentage": 0.02 } }, "content": [{ "reader": { "name": "mysqlreader", "batchSize":"8192", "batchByteSize":"33554432", "parameter": { "username": mysql_user, "password": mysql_passwd, "column": get_mysql_columns(source_database, source_table,source_columns), "splitPk": source_splitPk, "connection": [{ "table": [source_table], "jdbcUrl": ["jdbc:mysql://" + mysql_host + ":" + mysql_port + "/" + source_database + "?userCompress=true&useCursorFetch=true&useUnicode=true&characterEncoding=utf-8&useSSL=false"] }] } }, "writer": { "name": "hdfswriter", "batchSize":"8192", "batchByteSize":"33554432", "parameter": { "defaultFS": "hdfs://" + hdfs_nn_host + ":" + hdfs_nn_port, "fileType": "text", "path": "${targetdir}", "fileName": source_table, "column": get_hive_columns(source_database, source_table,source_columns), "writeMode": "append", "fieldDelimiter": u"\u0001", "compress": "gzip" } }, "transformer": [ { "name": "dx_groovy", "parameter": { "code": "for(int i=0;i<record.getColumnNumber();i++){if(record.getColumn(i).getByteSize()!=0){Column column = record.getColumn(i); def str = column.asString(); def newStr=null; newStr=str.replaceAll(\"[\\r\\n]\",\"\"); record.setColumn(i, new StringColumn(newStr)); };};return record;", "extraPackage":[] } } ] }] } } output_path = "/opt/module/datax/job/import/" + source_database if not os.path.exists(output_path): os.makedirs(output_path) with open(os.path.join(output_path, ".".join([source_database, source_table, "json"])), "w") as f: json.dump(job, f) def main(args): source_database = "" source_table = "" source_columns = "" source_splitPk = "" options, arguments = getopt.getopt(args, 'd:t:c:k:', ['sourcedb=', 'sourcetbl=', 'columns=', 'splitPk=']) for opt_name, opt_value in options: if opt_name in ('-d', '--sourcedb'): source_database = opt_value if opt_name in ('-t', '--sourcetbl'): source_table = opt_value if opt_name in ('-c', '--columns'): source_columns = opt_value if opt_name in ('-k', '--splitPk'): source_splitPk = opt_value generate_json(source_database, source_table,source_columns,source_splitPk) if __name__ == '__main__': main(sys.argv[1:])
#!/bin/bash
python ~/bin/sap_gateway_gen_import_config.py -d db -t table -c Id,created_date -k selfincrementid
python ~/bin/sap_gateway_gen_import_config.py -d db -t table -c all -k selfincrementid
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。