# gen_target_month_data.py (1.1 KB)

from pathlib import Path

import pandas as pd
from tqdm import tqdm

from common.database_utils import database_util
from common.log_utils import logFactory
from common import constant
  6. click_client = database_util.get_client()
  7. logger = logFactory("preprocess data").log
  8. def write_df_to_pickle(data, filename):
  9. logger.info(f"开始写入pickle")
  10. data.to_pickle(f"./data/pkl/{filename}")
  11. logger.info(f"写入pickle完成,文件名{filename},文件大小{data.shape}", )
  12. def get_df_by_id_month(uuid_list, month):
  13. logger.info(f"开始读取{month}数据")
  14. sql1 = f"select *, length(EVENT_APP_USE.C) as I_appuse from Z_USER_TAG_FLAT_out_{month} where uuid in {uuid_list} order by I_appuse desc limit 1 by uuid,EVENT_SPNAME_C"
  15. all_data1 = click_client.execute(sql1)
  16. data_frame1 = pd.DataFrame(all_data1, columns=constant.origin_column_names)
  17. logger.info(f"data1的shape{data_frame1.shape}")
  18. return data_frame1
  19. if __name__ == '__main__':
  20. path = "./data/csv/8910/tuidinguuids.csv"
  21. uuids = list(set(pd.read_csv(path).values.reshape(-1).tolist()))
  22. df = get_df_by_id_month(uuids,'202110')
  23. write_df_to_pickle(df, f"dbscan.pkl")