我们正在对 Hive 表执行 INSERT OVERWRITE。大多数时候它会按预期覆盖,即删除所有旧文件并替换为新文件。但我们发现这一行为并不稳定:有时旧文件都没有被删除,而新文件仍然被创建,从而导致数据不一致。
我无法重现此行为。只是想知道是否有人遇到过类似的问题或对可能发生的事情有任何指示。
我们使用的是 hive 版本 2.1.1。
下面是 ORC 表结构和 INSERT OVERWRITE 命令。fileid 是表中的唯一键列(取值唯一)。该表大小约为 500GB。
Hive 表结构:
-- External ORC table tier0.file, stored under s3a://bucket/tier0/file/.
-- Per the accompanying description, fileid is the unique key of this table.
-- NOTE: keep each column's type on a single physical line. Hard-wrapping the
-- long `contents` type inside an identifier (e.g. splitting `cliptype`)
-- turns one field name into two tokens and the statement no longer parses.
CREATE EXTERNAL TABLE `tier0.file`(
`filegroup` struct<collection:struct<name:string,code:string,royaltystate:string,enterprisecollectionid:bigint,isactive:boolean,active:boolean,filefamily:string,contentfamily:string,cfwcollectionname:string,droplocation:string,applyembeddestinationsite:boolean,associatedsource:string,excluderestriction:boolean,ownershiptype:string,collectionid:bigint,notes:string,bundlerestrictions:array<struct<bundleid:bigint,bundletype:string>>,pricecodes:array<struct<collectioncode:string,pricecode:string,iptccategory:string>>>,istockcollection:string,events:array<string>,paidassignmentids:array<string>,sisterfiles:array<string>,clonedfiles:array<string>,vcd:array<string>,source:struct<parentsource:string,parentsourceid:bigint,childsource:string,childsourceid:bigint>>,
`filemanagement` struct<filemanagement:string,destinationsites:array<string>,readyforsale:boolean,readyforpublish:boolean,reviewstatus:string,excludedestinationsites:array<string>,displaystatus:string,inactivedate:string,pulledreason:string,pulledreasonaudit:string,approvaldate:string,futurepulledreason:string,futureinactivedate:string,futureactivedate:string>,
`primarylanguage` string,
`audithistory` struct<note:string,notecategory:string>,
`contents` array<struct<deliverylocation:string,contenttype:string,submission:array<struct<data:struct<mimetype:string,fileinfo:struct<filelocation:string,filesize:bigint,filename:string,checksum:string,checksumtype:string>,submitdate:string,createdate:string,mediaformat:string,offlinehd:boolean,postertime:double,shoottype:string,stripaudio:boolean,timein:string,timeout:string,videoencoding:struct<compression:string,bitdepth:string,bitrate:double,definition:string,framerate:string,framesize:string,scantype:string,wrapper:string,height:int,width:int,interlaced:boolean>,rotation:string,anamorphic:boolean,pixelwidth:int,pixelheight:int,colorprofile:string,samplesperpixel:string,resolution:string,resolutionunit:string,colormode:string,animated:boolean,imageorientation:string,filmformat:string,duration:string,artistname:string,directlicense:boolean,lyrichook:string,albumtitle:string,parenttrackid:string,key:string,timesignature:string,publicdomain:string,lyrics:string,tracktitle:string,tracktype:string,speed:string,genre:string,mood:string,lyricpov:string,instrument:string,vocal:string,transformedmetadata:map<string,string>,iptc:map<string,string>,exif:map<string,string>,xmp:map<string,string>,xmpraw:map<string,string>>,sizeid:int,sizename:string,keyname:string,schemauri:string,extension:string,fileindex:int,suffix:string,readonly:boolean,ismaster:boolean>>,filepack:array<struct<data:struct<mimetype:string,fileinfo:struct<filelocation:string,filesize:bigint,filename:string,checksum:string,checksumtype:string>,submitdate:string,createdate:string,mediaformat:string,offlinehd:boolean,postertime:double,shoottype:string,stripaudio:boolean,timein:string,timeout:string,videoencoding:struct<compression:string,bitdepth:string,bitrate:double,definition:string,framerate:string,framesize:string,scantype:string,wrapper:string,height:int,width:int,interlaced:boolean>,rotation:string,anamorphic:boolean,pixelwidth:int,pixelheight:int,colorprofile:string,samplesperpixel:string,resolution:string,resolutionunit:string,colormode:string,animated:boolean,imageorientation:string,filmformat:string,duration:string,artistname:string,directlicense:boolean,lyrichook:string,albumtitle:string,parenttrackid:string,key:string,timesignature:string,publicdomain:string,lyrics:string,tracktitle:string,tracktype:string,speed:string,genre:string,mood:string,lyricpov:string,instrument:string,vocal:string,transformedmetadata:map<string,string>,iptc:map<string,string>,exif:map<string,string>,xmp:map<string,string>,xmpraw:map<string,string>>,sizeid:int,sizename:string,keyname:string,schemauri:string,extension:string,fileindex:int,suffix:string,readonly:boolean,ismaster:boolean>>,createdate:string,camerashotdate:string,updatedate:string,audithistory:array<struct<note:string,notecategory:string>>,contract:struct<parentsource:string,contractid:bigint,contentprovidername:string,contentprovidertitle:string,vendornumber:bigint,childsource:string,parentsourceid:bigint,childsourceid:bigint,istockusername:string,istockuserid:bigint,iptccredit:string,signatorycontentprovidername:string,signatoryguid:string,startdate:string,enddate:string>,release:struct<releaseid:string,releaseinformation:string,releasemetadata:array<struct<releasemetadataid:string,aliasid:string,releasetype:string,filelocation:string,name:string,agerange:string,age:string,birthdate:string,gender:string,ethnicity:string,ethnicities:array<string>,talentid:array<string>,usage:array<string>,teamsreleaseid:string>>>,contentmanagement:struct<state:string,notes:string,messages:array<string>>,contentsource:struct<clientsystemid:string,submittedby:string,ingestionproviderid:int,submissionnotes:string,clientlastmodifieddate:string>,alternateids:array<struct<alternateid:string,alternateidtype:string>>,homeproperty:string,mediatype:int,colorpalettes:struct<rgbmodel:array<struct<red:int,green:int,blue:int,presence:string,x:string,y:string,density:string>>>,transcript:string,hasaudio:boolean,visualcolor:string,era:string,cliptype:string,productiontitle:string,footagespeed:string>>,
`submitdate` string,
`licensecharacteristics` struct<filefamily:string,restrictioninstructions:string,riskcategory:string,advancedroyaltybearing:boolean,pricingcode:string,callforimage:boolean,exclusivecontent:boolean,subscriptioneligible:boolean,publicistapprovalrequired:boolean,whollyowned:boolean,royaltybearing:string,bundletags:array<string>,paidassignment:boolean,preferredlicensemodel:string,exclusivity:string,parentbundlecollection:string,restrictions:array<struct<id:string,beginningdate:string,enddate:string,controlledrestrictions:array<string>>>>,
`fileid` string,
`updatedate` string,
`version` int,
`exclusionrouting` array<string>,
`inclusionrouting` array<string>,
`errors` map<string,array<struct<errorcode:string,message:string>>>,
`dp_schema` string,
`dp_source` string,
`dp_source_type` string,
`dp_proc_time` string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
LOCATION
's3a://bucket/tier0/file/';
INSERT OVERWRITE 命令:
-- Overwrite stg.tier0_file with exactly one row per fileid, chosen from the
-- union of tier0.file and stg.file.
-- For each fileid the surviving row is the one with the highest version;
-- rows sharing a version are ordered by dp_proc_time descending.
-- NOTE(review): dp_proc_time is declared as string in tier0.file, so that
-- ordering is lexicographic there; the type in stg.file is not visible here.
-- NOTE(review): rows tied on both version and dp_proc_time are picked
-- arbitrarily by ROW_NUMBER().
WITH combined AS (
    -- All candidate rows: rows from tier0.file plus rows from stg.file.
    SELECT
        filegroup,
        filemanagement,
        primarylanguage,
        audithistory,
        contents,
        submitdate,
        licensecharacteristics,
        fileid,
        updatedate,
        version,
        errors,
        dp_schema,
        dp_source,
        dp_source_type,
        dp_proc_time
    FROM tier0.file

    UNION ALL

    SELECT
        filegroup,
        filemanagement,
        primarylanguage,
        audithistory,
        contents,
        submitdate,
        licensecharacteristics,
        fileid,
        updatedate,
        version,
        errors,
        dp_schema,
        dp_source,
        dp_source_type,
        dp_proc_time
    FROM stg.file
),
ranked AS (
    -- Number the rows of each fileid from most to least recent.
    SELECT
        combined.filegroup,
        combined.filemanagement,
        combined.primarylanguage,
        combined.audithistory,
        combined.contents,
        combined.submitdate,
        combined.licensecharacteristics,
        combined.fileid,
        combined.updatedate,
        combined.version,
        combined.errors,
        combined.dp_schema,
        combined.dp_source,
        combined.dp_source_type,
        combined.dp_proc_time,
        ROW_NUMBER() OVER (
            PARTITION BY combined.fileid
            ORDER BY combined.version DESC, combined.dp_proc_time DESC
        ) AS recency_rank
    FROM combined
)
INSERT OVERWRITE TABLE stg.tier0_file
SELECT
    ranked.filegroup,
    ranked.filemanagement,
    ranked.primarylanguage,
    ranked.audithistory,
    ranked.contents,
    ranked.submitdate,
    ranked.licensecharacteristics,
    ranked.fileid,
    ranked.updatedate,
    ranked.version,
    ranked.errors,
    ranked.dp_schema,
    ranked.dp_source,
    ranked.dp_source_type,
    ranked.dp_proc_time
FROM ranked
WHERE ranked.recency_rank = 1;