我们的项目继承了一些pandas 代码在 databricks/koalas 上运行时在新创建的列中返回空值。我们试图弄清楚为什么和/或如何改变它以产生与我们在桌面上使用 python 得到的结果相同的结果。代码和示例配置如下:
配置
"customScores": [
{
"name": "PrimAdrInd_Score",
"definition": "",
"score": 0.75,
"operator": "",
"dependencies": [
{
"name": "PrimAdrInd",
"value": "P",
"relationship": "=",
"definition": "If PrimAdrInd = P (Primary)"
}
]
},
功能
def create_custom_fields(df, calcs):
for calc in calcs:
col = calc['name']
operator = calc['operator']
op_count = len(calc['dependencies'])
temp = pd.DataFrame()
for i,op in enumerate(calc['dependencies']):
if op['relationship'] == '=': temp[str(i)] = df[op['name']] == op['value']
elif op['relationship'] == '>': temp[str(i)] = df[op['name']] > op['value']
elif op['relationship'] == '>=': temp[str(i)] = df[op['name']] >= op['value']
elif op['relationship'] == '<': temp[str(i)] = df[op['name']] < op['value']
elif op['relationship'] == '<=': temp[str(i)] = df[op['name']] <= op['value']
elif op['relationship'] == '!=': temp[str(i)] = df[op['name']] != op['value']
if operator == 'and':
temp['bool_score'] = temp.sum(axis=1)
temp['score'] = temp['bool_score'].apply(lambda x: calc['score'] if x == op_count else 0)
else:
temp['bool_score'] = temp.sum(axis=1)
temp['score'] = temp['bool_score'].apply(lambda x: calc['score'] if x > 1 else 0)
df[col] = temp['score']
return df
df = create_custom_fields(df, config['customScores'])