1

我们正在尝试对 Hive 表执行 INSERT OVERWRITE。大多数时候它会按预期覆盖，即删除所有旧文件并替换为新文件。但我们发现这一行为并不稳定：有时旧文件没有全部被删除，新文件却照样生成，从而导致数据不一致。

我无法稳定地重现此行为。想请教一下是否有人遇到过类似的问题，或者能否指点一下可能的原因。

我们使用的是 hive 版本 2.1.1。

下面是 ORC 表结构和 INSERT OVERWRITE 命令。fileid 是表中取值唯一的列（唯一键）。该表大小约为 500GB。

Hive 表结构:

-- External ORC table tier0.file, located at s3a://bucket/tier0/file/.
-- Each row describes one file record: nested grouping, management, content
-- and licensing structures plus scalar bookkeeping columns (fileid, version,
-- updatedate and the dp_* lineage columns).
-- NOTE(review): the accompanying description says fileid is unique per row.
-- Nothing in this DDL enforces that, so confirm against the loading job.
-- The contents column type is kept on a single physical line on purpose:
-- wrapping it in the middle of a field name makes the statement unparseable.
CREATE EXTERNAL TABLE `tier0.file`(
  `filegroup` struct<collection:struct<name:string,code:string,royaltystate:string,enterprisecollectionid:bigint,isactive:boolean,active:boolean,filefamily:string,contentfamily:string,cfwcollectionname:string,droplocation:string,applyembeddestinationsite:boolean,associatedsource:string,excluderestriction:boolean,ownershiptype:string,collectionid:bigint,notes:string,bundlerestrictions:array<struct<bundleid:bigint,bundletype:string>>,pricecodes:array<struct<collectioncode:string,pricecode:string,iptccategory:string>>>,istockcollection:string,events:array<string>,paidassignmentids:array<string>,sisterfiles:array<string>,clonedfiles:array<string>,vcd:array<string>,source:struct<parentsource:string,parentsourceid:bigint,childsource:string,childsourceid:bigint>>, 
  `filemanagement` struct<filemanagement:string,destinationsites:array<string>,readyforsale:boolean,readyforpublish:boolean,reviewstatus:string,excludedestinationsites:array<string>,displaystatus:string,inactivedate:string,pulledreason:string,pulledreasonaudit:string,approvaldate:string,futurepulledreason:string,futureinactivedate:string,futureactivedate:string>, 
  `primarylanguage` string, 
  `audithistory` struct<note:string,notecategory:string>, 
  `contents` array<struct<deliverylocation:string,contenttype:string,submission:array<struct<data:struct<mimetype:string,fileinfo:struct<filelocation:string,filesize:bigint,filename:string,checksum:string,checksumtype:string>,submitdate:string,createdate:string,mediaformat:string,offlinehd:boolean,postertime:double,shoottype:string,stripaudio:boolean,timein:string,timeout:string,videoencoding:struct<compression:string,bitdepth:string,bitrate:double,definition:string,framerate:string,framesize:string,scantype:string,wrapper:string,height:int,width:int,interlaced:boolean>,rotation:string,anamorphic:boolean,pixelwidth:int,pixelheight:int,colorprofile:string,samplesperpixel:string,resolution:string,resolutionunit:string,colormode:string,animated:boolean,imageorientation:string,filmformat:string,duration:string,artistname:string,directlicense:boolean,lyrichook:string,albumtitle:string,parenttrackid:string,key:string,timesignature:string,publicdomain:string,lyrics:string,tracktitle:string,tracktype:string,speed:string,genre:string,mood:string,lyricpov:string,instrument:string,vocal:string,transformedmetadata:map<string,string>,iptc:map<string,string>,exif:map<string,string>,xmp:map<string,string>,xmpraw:map<string,string>>,sizeid:int,sizename:string,keyname:string,schemauri:string,extension:string,fileindex:int,suffix:string,readonly:boolean,ismaster:boolean>>,filepack:array<struct<data:struct<mimetype:string,fileinfo:struct<filelocation:string,filesize:bigint,filename:string,checksum:string,checksumtype:string>,submitdate:string,createdate:string,mediaformat:string,offlinehd:boolean,postertime:double,shoottype:string,stripaudio:boolean,timein:string,timeout:string,videoencoding:struct<compression:string,bitdepth:string,bitrate:double,definition:string,framerate:string,framesize:string,scantype:string,wrapper:string,height:int,width:int,interlaced:boolean>,rotation:string,anamorphic:boolean,pixelwidth:int,pixelheight:int,colorprofile:string,samplesperpixel:string,resolution:string,resolutionunit:string,colormode:string,animated:boolean,imageorientation:string,filmformat:string,duration:string,artistname:string,directlicense:boolean,lyrichook:string,albumtitle:string,parenttrackid:string,key:string,timesignature:string,publicdomain:string,lyrics:string,tracktitle:string,tracktype:string,speed:string,genre:string,mood:string,lyricpov:string,instrument:string,vocal:string,transformedmetadata:map<string,string>,iptc:map<string,string>,exif:map<string,string>,xmp:map<string,string>,xmpraw:map<string,string>>,sizeid:int,sizename:string,keyname:string,schemauri:string,extension:string,fileindex:int,suffix:string,readonly:boolean,ismaster:boolean>>,createdate:string,camerashotdate:string,updatedate:string,audithistory:array<struct<note:string,notecategory:string>>,contract:struct<parentsource:string,contractid:bigint,contentprovidername:string,contentprovidertitle:string,vendornumber:bigint,childsource:string,parentsourceid:bigint,childsourceid:bigint,istockusername:string,istockuserid:bigint,iptccredit:string,signatorycontentprovidername:string,signatoryguid:string,startdate:string,enddate:string>,release:struct<releaseid:string,releaseinformation:string,releasemetadata:array<struct<releasemetadataid:string,aliasid:string,releasetype:string,filelocation:string,name:string,agerange:string,age:string,birthdate:string,gender:string,ethnicity:string,ethnicities:array<string>,talentid:array<string>,usage:array<string>,teamsreleaseid:string>>>,contentmanagement:struct<state:string,notes:string,messages:array<string>>,contentsource:struct<clientsystemid:string,submittedby:string,ingestionproviderid:int,submissionnotes:string,clientlastmodifieddate:string>,alternateids:array<struct<alternateid:string,alternateidtype:string>>,homeproperty:string,mediatype:int,colorpalettes:struct<rgbmodel:array<struct<red:int,green:int,blue:int,presence:string,x:string,y:string,density:string>>>,transcript:string,hasaudio:boolean,visualcolor:string,era:string,cliptype:string,productiontitle:string,footagespeed:string>>, 
  `submitdate` string, 
  `licensecharacteristics` struct<filefamily:string,restrictioninstructions:string,riskcategory:string,advancedroyaltybearing:boolean,pricingcode:string,callforimage:boolean,exclusivecontent:boolean,subscriptioneligible:boolean,publicistapprovalrequired:boolean,whollyowned:boolean,royaltybearing:string,bundletags:array<string>,paidassignment:boolean,preferredlicensemodel:string,exclusivity:string,parentbundlecollection:string,restrictions:array<struct<id:string,beginningdate:string,enddate:string,controlledrestrictions:array<string>>>>, 
  `fileid` string, 
  `updatedate` string, 
  `version` int, 
  `exclusionrouting` array<string>, 
  `inclusionrouting` array<string>, 
  `errors` map<string,array<struct<errorcode:string,message:string>>>, 
  `dp_schema` string, 
  `dp_source` string, 
  `dp_source_type` string, 
  `dp_proc_time` string)
ROW FORMAT SERDE 
  'org.apache.hadoop.hive.ql.io.orc.OrcSerde' 
STORED AS INPUTFORMAT 
  'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' 
OUTPUTFORMAT 
  'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
LOCATION
  's3a://bucket/tier0/file/';

INSERT OVERWRITE 命令：

-- Rebuild stg.tier0_file from the union of tier0.file and stg.file,
-- keeping exactly one row per fileid: the row with the highest version,
-- and among equal versions the greatest dp_proc_time.
-- NOTE(review): if two rows tie on both version and dp_proc_time, which one
-- gets row number 1 is not determined by this ORDER BY.
WITH combined AS (
    -- All candidate rows from both sources, duplicates kept on purpose
    -- so the ranking step below can choose between them.
    SELECT
        filegroup,
        filemanagement,
        primarylanguage,
        audithistory,
        contents,
        submitdate,
        licensecharacteristics,
        fileid,
        updatedate,
        version,
        errors,
        dp_schema,
        dp_source,
        dp_source_type,
        dp_proc_time
    FROM tier0.file
    UNION ALL
    SELECT
        filegroup,
        filemanagement,
        primarylanguage,
        audithistory,
        contents,
        submitdate,
        licensecharacteristics,
        fileid,
        updatedate,
        version,
        errors,
        dp_schema,
        dp_source,
        dp_source_type,
        dp_proc_time
    FROM stg.file
),
ranked AS (
    -- Number the rows of each fileid from newest to oldest.
    SELECT
        filegroup,
        filemanagement,
        primarylanguage,
        audithistory,
        contents,
        submitdate,
        licensecharacteristics,
        fileid,
        updatedate,
        version,
        errors,
        dp_schema,
        dp_source,
        dp_source_type,
        dp_proc_time,
        ROW_NUMBER() OVER (
            PARTITION BY fileid
            ORDER BY version DESC, dp_proc_time DESC
        ) AS rn
    FROM combined
)
INSERT OVERWRITE TABLE stg.tier0_file
SELECT
    filegroup,
    filemanagement,
    primarylanguage,
    audithistory,
    contents,
    submitdate,
    licensecharacteristics,
    fileid,
    updatedate,
    version,
    errors,
    dp_schema,
    dp_source,
    dp_source_type,
    dp_proc_time
FROM ranked
WHERE ranked.rn = 1;
4

0 回答 0