0

我们的项目继承了一些pandas 代码在 databricks/koalas 上运行时在新创建的列中返回空值。我们试图弄清楚为什么和/或如何改变它以产生与我们在桌面上使用 python 得到的结果相同的结果。代码和示例配置如下:

配置

"customScores": [
    {
      "name": "PrimAdrInd_Score",
      "definition": "",
      "score": 0.75,
      "operator": "",
      "dependencies": [
        {
          "name": "PrimAdrInd",
          "value": "P",
          "relationship": "=",
          "definition": "If PrimAdrInd = P (Primary)"
        }
      ]
    },

功能

def create_custom_fields(df, calcs):
    for calc in calcs:
        col = calc['name']
        operator = calc['operator']
        op_count = len(calc['dependencies'])

        temp = pd.DataFrame()
        for i,op in enumerate(calc['dependencies']):
            if op['relationship'] == '=': temp[str(i)] = df[op['name']] == op['value']
            elif op['relationship'] == '>': temp[str(i)] = df[op['name']] > op['value']
            elif op['relationship'] == '>=': temp[str(i)] = df[op['name']] >= op['value']
            elif op['relationship'] == '<': temp[str(i)] = df[op['name']] < op['value']
            elif op['relationship'] == '<=': temp[str(i)] = df[op['name']] <= op['value']
            elif op['relationship'] == '!=': temp[str(i)] = df[op['name']] != op['value']

        if operator == 'and':
            temp['bool_score'] = temp.sum(axis=1)
            temp['score'] = temp['bool_score'].apply(lambda x: calc['score'] if x == op_count else 0)
        else:
            temp['bool_score'] = temp.sum(axis=1)
            temp['score'] = temp['bool_score'].apply(lambda x: calc['score'] if x > 1 else 0)

        df[col] = temp['score']

    return df


df = create_custom_fields(df, config['customScores'])
4

0 回答 0