1. Over the past couple of days I have been migrating a Hive cluster from Hive 1.2 to Hive 2.1, and hit the following problem along the way:

Exception in thread "main" java.lang.AssertionError: Internal error: No assign rules for MAP defined
        at org.apache.calcite.util.Util.newInternal(Util.java:774)
        at org.apache.calcite.sql.type.SqlTypeAssignmentRules.canCastFrom(SqlTypeAssignmentRules.java:346)
        at org.apache.calcite.sql.type.SqlTypeUtil.canCastFrom(SqlTypeUtil.java:843)
        at org.apache.calcite.sql.type.SqlTypeFactoryImpl.leastRestrictiveByCast(SqlTypeFactoryImpl.java:172)
        at org.apache.calcite.sql.type.SqlTypeFactoryImpl.leastRestrictive(SqlTypeFactoryImpl.java:152)
        at org.apache.calcite.sql.fun.SqlCaseOperator.inferTypeFromOperands(SqlCaseOperator.java:271)
        at org.apache.calcite.sql.fun.SqlCaseOperator.inferReturnType(SqlCaseOperator.java:218)
        at org.apache.calcite.rex.RexBuilder.deriveReturnType(RexBuilder.java:270)
        at org.apache.calcite.rex.RexBuilder.makeCall(RexBuilder.java:244)
        at org.apache.hadoop.hive.ql.optimizer.calcite.translator.RexNodeConverter.convert(RexNodeConverter.java:237)
        at org.apache.hadoop.hive.ql.optimizer.calcite.translator.RexNodeConverter.convert(RexNodeConverter.java:136)
        at org.apache.hadoop.hive.ql.parse.CalcitePlanner$CalcitePlannerAction.genSelectLogicalPlan(CalcitePlanner.java:3047)
        at org.apache.hadoop.hive.ql.parse.CalcitePlanner$CalcitePlannerAction.genLogicalPlan(CalcitePlanner.java:3173)
        at org.apache.hadoop.hive.ql.parse.CalcitePlanner$CalcitePlannerAction.apply(CalcitePlanner.java:947)
        at org.apache.hadoop.hive.ql.parse.CalcitePlanner$CalcitePlannerAction.apply(CalcitePlanner.java:901)
        at org.apache.calcite.tools.Frameworks$1.apply(Frameworks.java:113)
        at org.apache.calcite.prepare.CalcitePrepareImpl.perform(CalcitePrepareImpl.java:969)
        at org.apache.calcite.tools.Frameworks.withPrepare(Frameworks.java:149)
        at org.apache.calcite.tools.Frameworks.withPlanner(Frameworks.java:106)
        at org.apache.hadoop.hive.ql.parse.CalcitePlanner.getOptimizedAST(CalcitePlanner.java:719)
        at org.apache.hadoop.hive.ql.parse.CalcitePlanner.genOPTree(CalcitePlanner.java:287)
        at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.analyzeInternal(SemanticAnalyzer.java:10831)
        at org.apache.hadoop.hive.ql.parse.CalcitePlanner.analyzeInternal(CalcitePlanner.java:246)
        at org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.analyze(BaseSemanticAnalyzer.java:250)
        at org.apache.hadoop.hive.ql.Driver.compile(Driver.java:477)
        at org.apache.hadoop.hive.ql.Driver.compileInternal(Driver.java:1242)
        at org.apache.hadoop.hive.ql.Driver.runInternal(Driver.java:1376)
        at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1171)
        at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1161)
        at org.apache.hadoop.hive.cli.CliDriver.processLocalCmd(CliDriver.java:232)
        at org.apache.hadoop.hive.cli.CliDriver.processCmd(CliDriver.java:183)
        at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:399)
        at org.apache.hadoop.hive.cli.CliDriver.executeDriver(CliDriver.java:776)
        at org.apache.hadoop.hive.cli.CliDriver.run(CliDriver.java:714)
        at org.apache.hadoop.hive.cli.CliDriver.main(CliDriver.java:641)
        at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
        at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
        at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
        at java.lang.reflect.Method.invoke(Method.java:498)
        at org.apache.hadoop.util.RunJar.run(RunJar.java:221)
        at org.apache.hadoop.util.RunJar.main(RunJar.java:136)

The SQL in question was fairly complex. At first I blamed a UDF, but after ruling things out one by one the culprit turned out to be a select case when ... then <map-typed column> end expression, i.e. a CASE returning a map value with no ELSE branch.

select
case when kv is not null and kv['spsug'] = 'ug' then kv 
end as kv 
from portal.gdm_sdk_app_event_kv_hour where day = 20180607 and type = 'action'
This throws the error above.
 
select
case when kv is not null and kv['spsug'] = 'ug' then kv 
else NULL end as kv 
from portal.gdm_sdk_app_event_kv_hour where day = 20180607 and type = 'action'
This runs fine.

Solution

Add an explicit else NULL branch to the CASE expression.
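A minimal sketch of the pattern, which should reproduce the Calcite error in a Hive 2.1 session with CBO enabled (the map literal is illustrative, not from the original table):

-- fails: CASE returning a map type with no ELSE branch
select case when true then map('k','v') end;

-- works: spell out the ELSE NULL
select case when true then map('k','v') else NULL end;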

2. Java heap space OOM in the Hive CLI

A simple SQL statement failed:

select * from EDM_USER_EXPOSURE_DETAIL_DAY where day='20180604' limit 1;
OK
Exception in thread "main" java.lang.OutOfMemoryError: Java heap space
        at java.util.Arrays.copyOf(Arrays.java:3236)
        at sun.misc.Resource.getBytes(Resource.java:117)
        at java.net.URLClassLoader.defineClass(URLClassLoader.java:462)
        at java.net.URLClassLoader.access$100(URLClassLoader.java:73)
        at java.net.URLClassLoader$1.run(URLClassLoader.java:368)
        at java.net.URLClassLoader$1.run(URLClassLoader.java:362)
        at java.security.AccessController.doPrivileged(Native Method)
        at java.net.URLClassLoader.findClass(URLClassLoader.java:361)
        at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
        at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:335)
        at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
        at org.apache.hadoop.hive.common.FileUtils.deleteDirectory(FileUtils.java:777)
        at org.apache.hadoop.hive.ql.session.SessionState.close(SessionState.java:1525)
        at org.apache.hadoop.hive.cli.CliSessionState.close(CliSessionState.java:70)
        at org.apache.hadoop.hive.cli.CliDriver.run(CliDriver.java:717)
        at org.apache.hadoop.hive.cli.CliDriver.main(CliDriver.java:641)
        at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
        at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
        at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
        at java.lang.reflect.Method.invoke(Method.java:498)
        at org.apache.hadoop.util.RunJar.run(RunJar.java:221)
        at org.apache.hadoop.util.RunJar.main(RunJar.java:136)
Exception in thread "Thread-7" java.lang.OutOfMemoryError: GC overhead limit exceeded

Exception: java.lang.OutOfMemoryError thrown from the UncaughtExceptionHandler in thread "Thread-7"
Exception in thread "Thread-9" Exception in thread "Thread-1" java.lang.OutOfMemoryError: GC overhead limit exceeded
        at java.lang.String.toCharArray(String.java:2899)
        at java.util.zip.ZipCoder.getBytes(ZipCoder.java:78)
        at java.util.zip.ZipFile.getEntry(ZipFile.java:316)
        at java.util.jar.JarFile.getEntry(JarFile.java:240)
        at java.util.jar.JarFile.getJarEntry(JarFile.java:223)
        at sun.misc.URLClassPath$JarLoader.getResource(URLClassPath.java:1042)
        at sun.misc.URLClassPath.getResource(URLClassPath.java:239)
        at java.net.URLClassLoader$1.run(URLClassLoader.java:365)
        at java.net.URLClassLoader$1.run(URLClassLoader.java:362)
        at java.security.AccessController.doPrivileged(Native Method)
        at java.net.URLClassLoader.findClass(URLClassLoader.java:361)
        at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
        at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:335)
        at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
        at jline.console.history.MemoryHistory.entries(MemoryHistory.java:152)
        at jline.console.history.MemoryHistory.entries(MemoryHistory.java:156)
        at jline.console.history.MemoryHistory.iterator(MemoryHistory.java:160)
        at jline.console.history.FileHistory.flush(FileHistory.java:89)
        at org.apache.hadoop.hive.cli.CliDriver$10.run(CliDriver.java:817)
        at java.lang.Thread.run(Thread.java:748)

Solution

Add the following to the hive launcher script in /home/hadoop/hive_client/bin/:
export HADOOP_CLIENT_OPTS="$HADOOP_CLIENT_OPTS -XX:NewRatio=12 -Xms10m -Xmx12288m -XX:MaxHeapFreeRatio=40 -XX:MinHeapFreeRatio=15 -XX:-UseGCOverheadLimit"
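To double-check that the bigger heap actually reaches the client JVM, one quick verification sketch (assumes the JDK's jps tool is on PATH and a hive session is running; the Hive CLI shows up under org.apache.hadoop.util.RunJar):

# jps -lvm prints each JVM's main class plus its flags; look for the -Xmx we just set
jps -lvm | grep RunJar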

3. SQL job fails: container running beyond physical memory limits

31-07-2018 18:16:01 CST adm_user_channel_type_weekly INFO - Container [pid=5549,containerID=container_e56_1531715559020_414691_01_000758] is running beyond physical memory limits. Current usage: 4.6 GB of 4.5 GB physical memory used; 6.8 GB of 9.4 GB virtual memory used. Killing container.

31-07-2018 18:16:01 CST adm_user_channel_type_weekly INFO - Dump of the process-tree for container_e56_1531715559020_414691_01_000758 :

31-07-2018 18:16:01 CST adm_user_channel_type_weekly INFO - |- PID PPID PGRPID SESSID CMD_NAME USER_MODE_TIME(MILLIS) SYSTEM_TIME(MILLIS) VMEM_USAGE(BYTES) RSSMEM_USAGE(PAGES) FULL_CMD_LINE

31-07-2018 18:16:01 CST adm_user_channel_type_weekly INFO - |- 5556 5549 5549 5549 (java) 7750 1889 7146377216 1200563 /usr/jdk64/jdk1.8.0_77/bin/java -XX:+UseSerialGC -Djava.net.preferIPv4Stack=true -Dhadoop.metrics.log.level=WARN -Xmx5096M -Xms5096M -XX:PermSize=128M -Djava.io.tmpdir=/mnt/dfs/7/yarn/local/usercache/portal/appcache/application_1531715559020_414691/container_e56_1531715559020_414691_01_000758/tmp -Dlog4j.configuration=container-log4j.properties -Dyarn.app.container.log.dir=/mnt/dfs/11/yarn/logs/application_1531715559020_414691/container_e56_1531715559020_414691_01_000758 -Dyarn.app.container.log.filesize=0 -Dhadoop.root.logger=INFO,CLA -Dhadoop.root.logfile=syslog -Dyarn.app.mapreduce.shuffle.logger=INFO,shuffleCLA -Dyarn.app.mapreduce.shuffle.logfile=syslog.shuffle -Dyarn.app.mapreduce.shuffle.log.filesize=0 -Dyarn.app.mapreduce.shuffle.log.backups=0 org.apache.hadoop.mapred.YarnChild 10.196.76.56 45943 attempt_1531715559020_414691_r_000003_3 61572651156214

31-07-2018 18:16:01 CST adm_user_channel_type_weekly INFO - |- 5549 5547 5549 5549 (bash) 0 1 115847168 352 /bin/bash -c /usr/jdk64/jdk1.8.0_77/bin/java -XX:+UseSerialGC -Djava.net.preferIPv4Stack=true -Dhadoop.metrics.log.level=WARN -Xmx5096M -Xms5096M -XX:PermSize=128M -Djava.io.tmpdir=/mnt/dfs/7/yarn/local/usercache/portal/appcache/application_1531715559020_414691/container_e56_1531715559020_414691_01_000758/tmp -Dlog4j.configuration=container-log4j.properties -Dyarn.app.container.log.dir=/mnt/dfs/11/yarn/logs/application_1531715559020_414691/container_e56_1531715559020_414691_01_000758 -Dyarn.app.container.log.filesize=0 -Dhadoop.root.logger=INFO,CLA -Dhadoop.root.logfile=syslog -Dyarn.app.mapreduce.shuffle.logger=INFO,shuffleCLA -Dyarn.app.mapreduce.shuffle.logfile=syslog.shuffle -Dyarn.app.mapreduce.shuffle.log.filesize=0 -Dyarn.app.mapreduce.shuffle.log.backups=0 org.apache.hadoop.mapred.YarnChild 10.196.76.56 45943 attempt_1531715559020_414691_r_000003_3 61572651156214 1>/mnt/dfs/11/yarn/logs/application_1531715559020_414691/container_e56_1531715559020_414691_01_000758/stdout 2>/mnt/dfs/11/yarn/logs/application_1531715559020_414691/container_e56_1531715559020_414691_01_000758/stderr

31-07-2018 18:16:01 CST adm_user_channel_type_weekly INFO -

31-07-2018 18:16:01 CST adm_user_channel_type_weekly INFO - Container killed on request. Exit code is 143

31-07-2018 18:16:01 CST adm_user_channel_type_weekly INFO - Container exited with a non-zero exit code 143

31-07-2018 18:16:01 CST adm_user_channel_type_weekly INFO -

31-07-2018 18:16:01 CST adm_user_channel_type_weekly INFO -

31-07-2018 18:16:01 CST adm_user_channel_type_weekly INFO - FAILED: Execution Error, return code 2 from org.apache.hadoop.hive.ql.exec.mr.MapRedTask

31-07-2018 18:16:01 CST adm_user_channel_type_weekly INFO - MapReduce Jobs Launched:

31-07-2018 18:16:01 CST adm_user_channel_type_weekly INFO - Stage-Stage-1: Map: 401 Reduce: 347 Cumulative CPU: 14271.34 sec HDFS Read: 28256929290 HDFS Write: 334066 FAIL

31-07-2018 18:16:01 CST adm_user_channel_type_weekly INFO - Total MapReduce CPU Time Spent: 0 days 3 hours 57 minutes 51 seconds 340 msec

31-07-2018 18:16:01 CST adm_user_channel_type_weekly INFO - job error

31-07-2018 18:16:02 CST adm_user_channel_type_weekly INFO - Process completed unsuccessfully in 335 seconds.

31-07-2018 18:16:02 CST adm_user_channel_type_weekly ERROR - Job run failed!

java.lang.RuntimeException: azkaban.jobExecutor.utils.process.ProcessFailureException

at azkaban.jobExecutor.ProcessJob.run(ProcessJob.java:210)

at azkaban.execapp.JobRunner.runJob(JobRunner.java:744)

at azkaban.execapp.JobRunner.run(JobRunner.java:508)

at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)

at java.util.concurrent.FutureTask.run(FutureTask.java:266)

at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)

at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)

at java.lang.Thread.run(Thread.java:748)

Caused by: azkaban.jobExecutor.utils.process.ProcessFailureException

at azkaban.jobExecutor.utils.process.AzkabanProcess.run(AzkabanProcess.java:138)

at azkaban.jobExecutor.ProcessJob.run(ProcessJob.java:203)

... 7 more

31-07-2018 18:16:02 CST adm_user_channel_type_weekly ERROR - azkaban.jobExecutor.utils.process.ProcessFailureException cause: azkaban.jobExecutor.utils.process.ProcessFailureException

31-07-2018 18:16:02 CST adm_user_channel_type_weekly INFO - Finishing job adm_user_channel_type_weekly attempt: 3 at 1533032162005 with status FAILED_RETRYING

Solution

In the mapred-site.xml under /home/hadoop/hadoop-client/etc/hadoop, raise the task JVM heaps:
    <property>
        <name>mapreduce.map.java.opts</name>
        <value>-Xmx6036M -XX:+UseSerialGC</value>
    </property>
    <property>
        <name>mapreduce.reduce.java.opts</name>
        <value>-Xmx8096M -Xms8096M -XX:PermSize=128M</value>
    </property>
    <property>
        <name>yarn.app.mapreduce.am.command-opts</name>
        <value>-Xmx2228M</value>
    </property>
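Note these heaps only help if the YARN container sizes leave headroom above them: the physical-memory limit in the error covers the whole process, not just the Java heap. The same tuning can also be applied per job from the Hive session instead of editing the client config; a sketch, with container sizes chosen by me (an assumption, not from the cluster config) to sit above the -Xmx values in the XML:

set mapreduce.map.memory.mb=7168;
set mapreduce.map.java.opts=-Xmx6036M -XX:+UseSerialGC;
set mapreduce.reduce.memory.mb=9216;
set mapreduce.reduce.java.opts=-Xmx8096M -Xms8096M -XX:PermSize=128M;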

4. Exposure/click event count (PV) coming out smaller than the user count (UV)

Today I hit a case where a query returned an event count (PV) smaller than the distinct-user count (UV). The SQL:

select 
       case when prog['feedlist_refactor'] like '%T1534831577502%' then '新' else '旧' end
       ,count(distinct case when is_rcc='1' then concat(device_uuid,eid,content_id) end ) rcc_pv
       ,count(distinct case when is_rcc='1' then device_uuid end ) rcc_uv
  from portal.edm_galaxy_reader_exposure_detail_day
 where day='20181223'
   and  app_id in ('2S5Wcx')
   and cast(split(app_version, '\\.')[0] as int) >= 48 
   and column_name ='讲讲::推荐'
 group by 
      case when prog['feedlist_refactor'] like '%T1534831577502%' then '新' else '旧' end;
      
The result:
新      1535    2070
旧      18      36

The event count is far lower than the user count. After a round of digging, I found that content_id inside count(distinct case when is_rcc='1' then concat(device_uuid,eid,content_id) end) is sometimes NULL, and in Hive concat returns NULL if any argument is NULL.

Take the following rows, apply concat(device_uuid, eid, content_id) across the three columns, then count(distinct ...):
1    1545547049489   A1
2    1545546798223   A1
3    1545580065025   NULL
4    1545574228116   A1
5    1545577264556   NULL
6    1545536330430   NULL
7    1545558735901   A1
8    1545580068366   NULL
9    1545508485924   NULL
10   1545574677628   A1

The count comes out as 5: rows whose concat(...) is NULL (because content_id is NULL) are simply not counted by count(distinct).
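The usual fix is to make the concatenated key NULL-safe, so a row with a NULL content_id still produces a countable value; a sketch (the empty-string default in coalesce is my choice, not from the original job):

select count(distinct case when is_rcc='1'
             then concat(device_uuid, eid, coalesce(content_id, '')) end) rcc_pv
  from portal.edm_galaxy_reader_exposure_detail_day
 where day='20181223';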

5. Another out-of-memory error

The SQL was straightforward:

set hive.map.aggr=true;
set hive.groupby.skewindata=true;
set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
set hive.auto.convert.join=true;
set hive.merge.orcfile.stripe.level=false;
set hive.merge.mapfiles=true;
set hive.merge.mapredfiles=true;
set hive.mapred.mode=nonstrict;
set hive.merge.smallfiles.avgsize=975175680;
set mapred.max.split.size=1024000000;
set mapred.min.split.size.per.node=256000000;
set mapred.min.split.size.per.rack=256000000;
set hive.exec.max.created.files=1000000;

insert overwrite table portal.adm_reader_exposure_2_day partition (day='20190116')
select
   app_id,
   case
      when column_name ='头条' then '头条'
      when column_name like '讲讲::%' then '讲讲'
      else 'other' end as column_name,
   count(distinct case when from_type='列表' then concat(device_uuid,eid,recommend_id) else null end),
   count(distinct case when from_type='列表' then device_uuid else null end),
   count(distinct case when from_type='列表' and is_rcc=1 then concat(device_uuid,eid,recommend_id) else null end) rcc_num,
   count(distinct case when from_type='列表' and is_rcc=1 then device_uuid else null end) rcc_uuid,
   count(distinct case when from_type='列表' and is_rcc=1 and rcc_type in (1,3) then concat(device_uuid,eid,recommend_id) else null end) rec_rcc_num,
   count(distinct case when from_type='列表' and is_rcc=1 and rcc_type in (1,3) then device_uuid else null end) rec_rcc_uuid,
   count(distinct case when from_type='列表' and is_rcc=1 and rcc_type in (2,3) then concat(device_uuid,eid,recommend_id) else null end) doc_rcc_num,
   count(distinct case when from_type='列表' and is_rcc=1 and rcc_type in (2,3) then device_uuid else null end) doc_rcc_uuid,
   count(distinct case when from_type='文章页' then concat(device_uuid,eid,recommend_id) else null end),
   count(distinct case when from_type='文章页' then device_uuid else null end),
   count(distinct case when from_type='文章页' and is_rcc=1 then concat(device_uuid,eid,recommend_id) else null end),
   count(distinct case when from_type='文章页' and is_rcc=1 then device_uuid else null end)
from portal.edm_galaxy_reader_exposure_detail_day
where day='20190116'
   and app_id in ('2x1kfBk63z', '2S5Wcx','RjOJS2')
   and (column_name ='头条' or column_name like '讲讲::%')
group by
   app_id,
   case
      when column_name ='头条' then '头条'
      when column_name like '讲讲::%' then '讲讲'
      else 'other' end
union all
select
   app_id,
   '整体' column_name,
   count(distinct case when from_type='列表' then concat(device_uuid,eid,recommend_id) else null end),
   count(distinct case when from_type='列表' then device_uuid else null end),
   count(distinct case when from_type='列表' and is_rcc=1 then concat(device_uuid,eid,recommend_id) else null end) rcc_num,
   count(distinct case when from_type='列表' and is_rcc=1 then device_uuid else null end) rcc_uuid,
   count(distinct case when from_type='列表' and is_rcc=1 and rcc_type in (1,3) then concat(device_uuid,eid,recommend_id) else null end) rec_rcc_num,
   count(distinct case when from_type='列表' and is_rcc=1 and rcc_type in (1,3) then device_uuid else null end) rec_rcc_uuid,
   count(distinct case when from_type='列表' and is_rcc=1 and rcc_type in (2,3) then concat(device_uuid,eid,recommend_id) else null end) doc_rcc_num,
   count(distinct case when from_type='列表' and is_rcc=1 and rcc_type in (2,3) then device_uuid else null end) doc_rcc_uuid,
   count(distinct case when from_type='文章页' then concat(device_uuid,eid,recommend_id) else null end),
   count(distinct case when from_type='文章页' then device_uuid else null end),
   count(distinct case when from_type='文章页' and is_rcc=1 then concat(device_uuid,eid,recommend_id) else null end),
   count(distinct case when from_type='文章页' and is_rcc=1 then device_uuid else null end)
from portal.edm_galaxy_reader_exposure_detail_day
where day='20190116'
   and app_id in ('2x1kfBk63z', '2S5Wcx','RjOJS2')
   and (column_name ='头条' or column_name like '讲讲::%')
group by
   app_id;

It failed with:

Diagnostic Messages for this Task:

Error: org.apache.hadoop.mapreduce.task.reduce.Shuffle$ShuffleError: error in shuffle in fetcher#1
    at org.apache.hadoop.mapreduce.task.reduce.Shuffle.run(Shuffle.java:134)
    at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:376)
    at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:164)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:422)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1698)
    at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158)
Caused by: java.lang.OutOfMemoryError: Java heap space
    at org.apache.hadoop.io.BoundedByteArrayOutputStream.<init>(BoundedByteArrayOutputStream.java:56)
    at org.apache.hadoop.io.BoundedByteArrayOutputStream.<init>(BoundedByteArrayOutputStream.java:46)
    at org.apache.hadoop.mapreduce.task.reduce.InMemoryMapOutput.<init>(InMemoryMapOutput.java:63)
    at org.apache.hadoop.mapreduce.task.reduce.MergeManagerImpl.unconditionalReserve(MergeManagerImpl.java:305)
    at org.apache.hadoop.mapreduce.task.reduce.MergeManagerImpl.reserve(MergeManagerImpl.java:295)
    at org.apache.hadoop.mapreduce.task.reduce.Fetcher.copyMapOutput(Fetcher.java:514)
    at org.apache.hadoop.mapreduce.task.reduce.Fetcher.copyFromHost(Fetcher.java:336)
    at org.apache.hadoop.mapreduce.task.reduce.Fetcher.run(Fetcher.java:193)

FAILED: Execution Error, return code 2 from org.apache.hadoop.hive.ql.exec.mr.MapRedTask

Solution

According to "Hadoop: The Definitive Guide, 4th Edition" (pp. 203-219), a shuffle phase runs between the map and reduce tasks: it copies the map tasks' output to serve as the reduce tasks' input. Concretely, the shuffle reads the map output files and writes into an in-memory buffer on the machine running the reduce task, using several copier threads in parallel. The parameter mapreduce.reduce.shuffle.input.buffer.percent controls what fraction of the reducer's heap is used for that buffer (default 0.70), and mapreduce.reduce.shuffle.parallelcopies controls the copy parallelism (default 5). The product mapreduce.reduce.shuffle.input.buffer.percent * mapreduce.reduce.shuffle.parallelcopies must be <= 1, otherwise the error above can appear. I set mapreduce.reduce.shuffle.input.buffer.percent to 0.1 and the job ran fine (0.2 still threw the same error).
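Applied from the Hive session, the fix is one line; a sketch (0.1 is the value that worked here, and with the default 5 parallel copiers the product becomes 0.1 * 5 = 0.5 <= 1):

set mapreduce.reduce.shuffle.input.buffer.percent=0.1;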

Incidentally, with both defaults in place the product is 0.70 * 5 = 3.5, far above 1, so why isn't this error thrown all the time?

1) First, the defaults are deliberately large for performance: they greatly speed up copying data from the map side.

2) Second, triggering the exception also requires that a batch of map outputs becomes ready for shuffling all at once. In that case the copier thread count and the in-memory buffer usage hit their ceilings at the same time, which produces the OOM. If map outputs instead finish in dribs and drabs, the threads and the buffer are never all saturated at the same moment, so the error never surfaces.

Also, if the error persists after tuning the parameters above, the reduce task's process may simply not have enough total memory; that can be adjusted via mapred.child.java.opts, e.g. mapred.child.java.opts=-Xmx2024m. I plan to add the following and retest:

set mapreduce.map.memory.mb=4096;
set mapreduce.reduce.memory.mb=7036;
set mapreduce.map.java.opts=-Xmx3036M;
set mapreduce.reduce.java.opts=-Xmx5048M;

6. DISTINCT on different columns not supported with skew in data

Out of the blue, a query failed with:

DISTINCT on different columns not supported with skew in data

The cause: during Hive tuning I had set

hive.groupby.skewindata=true;

This is the data-skew parameter. When true, it splits a group-by into two MR jobs: in the first job the map output is distributed to reducers randomly, so rows with the same key may land on different reducers, where partial aggregation is done to balance the load; the second job then completes the aggregation with the normal key distribution. The caveat is that this parameter does not support count(distinct ...) over multiple different columns in a single query, and sure enough my SQL contained N count(distinct) expressions, as sketched below.
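A sketch of the pattern that trips this check, and the simplest workaround (the columns are illustrative, borrowed from the table in section 5):

-- fails with the parameter on: DISTINCT over two different columns
set hive.groupby.skewindata=true;
select count(distinct device_uuid), count(distinct eid)
  from portal.edm_galaxy_reader_exposure_detail_day
 where day='20190116';

-- workaround: disable it for this query (or rewrite with one subquery per distinct column)
set hive.groupby.skewindata=false;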

7. Dynamic-partition job error

Error: java.lang.RuntimeException: org.apache.hadoop.hive.ql.metadata.HiveException: Hive Runtime Error while processing row (tag=0) {"key":{},"value":{"_col0":"CQkzNDg1Y2E1YzAzMzg3OTRjCTRhMjIwNWEw","_col1":"vivo Y85A","_col2":"1080*2280","_col3":"CMCC","_col4":"WIFI","_col5":"94:63:72:95:E7:E1","_col6":null,"_col7":"869829049641386","_col8":"a","_col9":"8.1.0","_col10":"3485ca5c0338794c","_col11":"BZoK1k6nPQ/HtQWCOgthGGk2xSRTVofCjCPcRBAyqgw5XaP2tn7aQu1YOyWF1qjzJfeYb+hCDWEbekpqioxE+A==","_col12":"GGLFjIGnPNLoW2EZdFl5dAUesgMYzYmwFeZFwiaPKf8=","_col13":"zh","_col14":"中国标准时间","_col15":"CQkzNDg1Y2E1YzAzMzg3OTRjCTRhMjIwNWEw","_col16":"2x1kfBk63z","_col17":"55.5","_col18":"1","_col19":"1.6.1","_col20":"1026","_col21":"com.netease.newsreader.activity","_col22":"iiabgf1557086350158","_col23":"2019/05/06 03:59:11","_col24":"183.198.11.133","_col25":"中国","_col26":"河北","_col27":"邯郸","_col28":"2019-05-06 03:59:10","_col29":"2019-05-06 04:12:05","_col30":null,"_col31":"0","_col32":"vivo_store2014_news","_col33":"QQ_news_CPD1","_col34":{"PROG":"Rpic2","comment_hot_rank_enable":"1","docpage_related":"10","docpage_screenshot":"1","exit_recommend":"{\"period\":\"daily\"#@#\"count\":\"2\"}","feedback_ext":"{\"open\":0}","galaxy_default_priority_interval":"500","galaxy_immediately_sender_type":"2","galaxy_url":"http://m.analytics.126.net/news/c","icon_backgroundTrigger":"{\"time_iconshow\":1200#@#\"number_iconshow\":1#@#\"effect\":1}","keepLive_setting":"{\"keepLive\":[{\"type\":\"JobScheduler\"#@#\"interval\":60#@#\"enable\":true}#@#{\"type\":\"AccountSync\"#@#\"interval\":60}#@#{\"type\":\"OnePixel\"#@#\"enable\":true}]}","list_refreshsolution":"rocket_refresh","motif_rec_bottom":"1","motif_rec_top":"1","newslist_recommend_motif_card":"1","permitpopup_time":"3500","pusharticle":"newstyle4","recprog":"NORSLK","search_h5_url":"{\"title\":\"281\"#@#\"url\":\"https://wp.m.163.com/163/html/frontend/newsapp-search-v1/index.html\"}","share_config":"{\"share_path\":1}","tab_setting":"{\"tab\":[{\"id\":\"video\"#@#\"name\":\"视频\"}#@#{\"id\":\"discovery\"#@#\"name\":\"讲讲\"}]}","tie_ad_Yanxuan":"nochange","tie_hotlist":"hotlist_A","time_noticheck":"-1","video_autoplaycountdown":"0","video_cache":"0","video_related":"-5","videoplay":"newlist1","yuedujia_card":"1","yuedujia_card_up":"1","yuedujia_rec":"1"},"_col35":"app","_col36":{"":null},"_col37":{},"_col38":"ENTRYX","_col39":"","_col40":null,"_col41":"2019/05/06 04:04:57","_col42":"2019-05-06 04:04:57","_col43":null,"_col44":"头条","_col45":{"ENTRYX":"EEC0SF6005481NFY","column":"头条","tag":"详情页-back键返回"},"_col46":"1","_col47":0,"_col48":"27","_col49":"4a2205a0","_col50":"vivo","_col51":"vivo","_col52":"OPM1.171019.011 release-keys","_col53":"io9hQf3Tgyc5ECpGHaWdVG57McvxMr0r8/OeBWIiUJ44rU+oUyKmb2isn9dFzkjy","_col54":"2019050604","_col55":"function"}} at org.apache.hadoop.hive.ql.exec.mr.ExecReducer.reduce(ExecReducer.java:257) at org.apache.hadoop.mapred.ReduceTask.runOldReducer(ReduceTask.java:444) at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:392) at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:177) at java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:422) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1893) at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:171) Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: Hive Runtime Error while processing row (tag=0) 
{"key":{},"value":{"_col0":"CQkzNDg1Y2E1YzAzMzg3OTRjCTRhMjIwNWEw","_col1":"vivo Y85A","_col2":"1080*2280","_col3":"CMCC","_col4":"WIFI","_col5":"94:63:72:95:E7:E1","_col6":null,"_col7":"869829049641386","_col8":"a","_col9":"8.1.0","_col10":"3485ca5c0338794c","_col11":"BZoK1k6nPQ/HtQWCOgthGGk2xSRTVofCjCPcRBAyqgw5XaP2tn7aQu1YOyWF1qjzJfeYb+hCDWEbekpqioxE+A==","_col12":"GGLFjIGnPNLoW2EZdFl5dAUesgMYzYmwFeZFwiaPKf8=","_col13":"zh","_col14":"中国标准时间","_col15":"CQkzNDg1Y2E1YzAzMzg3OTRjCTRhMjIwNWEw","_col16":"2x1kfBk63z","_col17":"55.5","_col18":"1","_col19":"1.6.1","_col20":"1026","_col21":"com.netease.newsreader.activity","_col22":"iiabgf1557086350158","_col23":"2019/05/06 03:59:11","_col24":"183.198.11.133","_col25":"中国","_col26":"河北","_col27":"邯郸","_col28":"2019-05-06 03:59:10","_col29":"2019-05-06 04:12:05","_col30":null,"_col31":"0","_col32":"vivo_store2014_news","_col33":"QQ_news_CPD1","_col34":{"PROG":"Rpic2","comment_hot_rank_enable":"1","docpage_related":"10","docpage_screenshot":"1","exit_recommend":"{\"period\":\"daily\"#@#\"count\":\"2\"}","feedback_ext":"{\"open\":0}","galaxy_default_priority_interval":"500","galaxy_immediately_sender_type":"2","galaxy_url":"http://m.analytics.126.net/news/c","icon_backgroundTrigger":"{\"time_iconshow\":1200#@#\"number_iconshow\":1#@#\"effect\":1}","keepLive_setting":"{\"keepLive\":[{\"type\":\"JobScheduler\"#@#\"interval\":60#@#\"enable\":true}#@#{\"type\":\"AccountSync\"#@#\"interval\":60}#@#{\"type\":\"OnePixel\"#@#\"enable\":true}]}","list_refreshsolution":"rocket_refresh","motif_rec_bottom":"1","motif_rec_top":"1","newslist_recommend_motif_card":"1","permitpopup_time":"3500","pusharticle":"newstyle4","recprog":"NORSLK","search_h5_url":"{\"title\":\"281\"#@#\"url\":\"https://wp.m.163.com/163/html/frontend/newsapp-search-v1/index.html\"}","share_config":"{\"share_path\":1}","tab_setting":"{\"tab\":[{\"id\":\"video\"#@#\"name\":\"视频\"}#@#{\"id\":\"discovery\"#@#\"name\":\"讲讲\"}]}","tie_ad_Yanxuan":"nochange","tie_hotlist":"hotlist_A","time_noticheck":"-1","video_autoplaycountdown":"0","video_cache":"0","video_related":"-5","videoplay":"newlist1","yuedujia_card":"1","yuedujia_card_up":"1","yuedujia_rec":"1"},"_col35":"app","_col36":{"":null},"_col37":{},"_col38":"ENTRYX","_col39":"","_col40":null,"_col41":"2019/05/06 04:04:57","_col42":"2019-05-06 04:04:57","_col43":null,"_col44":"头条","_col45":{"ENTRYX":"EEC0SF6005481NFY","column":"头条","tag":"详情页-back键返回"},"_col46":"1","_col47":0,"_col48":"27","_col49":"4a2205a0","_col50":"vivo","_col51":"vivo","_col52":"OPM1.171019.011 release-keys","_col53":"io9hQf3Tgyc5ECpGHaWdVG57McvxMr0r8/OeBWIiUJ44rU+oUyKmb2isn9dFzkjy","_col54":"2019050604","_col55":"function"}} at org.apache.hadoop.hive.ql.exec.mr.ExecReducer.reduce(ExecReducer.java:245) ... 7 more Caused by: org.apache.hadoop.hive.ql.metadata.HiveFatalException: [Error 20004]: Fatal error occurred when node tried to create too many dynamic partitions. The maximum number of dynamic partitions is controlled by hive.exec.max.dynamic.partitions and hive.exec.max.dynamic.partitions.pernode. 
Maximum was set to 100 partitions per node, number of dynamic partitions on this node: 101 at org.apache.hadoop.hive.ql.exec.FileSinkOperator.getDynOutPaths(FileSinkOperator.java:933) at org.apache.hadoop.hive.ql.exec.FileSinkOperator.process(FileSinkOperator.java:704) at org.apache.hadoop.hive.ql.exec.Operator.forward(Operator.java:879) at org.apache.hadoop.hive.ql.exec.SelectOperator.process(SelectOperator.java:95) at org.apache.hadoop.hive.ql.exec.mr.ExecReducer.reduce(ExecReducer.java:236) ... 7 more 

The cause: the job uses dynamic partitioning, first by hour and then by type, which creates more dynamic partitions than a single node is allowed to write. Raising the limits fixes it:

set hive.exec.max.dynamic.partitions.pernode=500;
set hive.exec.max.dynamic.partitions=3000;
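If raising the limits is undesirable, another option is to route all rows of the same target partition to the same reducer, so each task creates fewer partitions; a sketch with hypothetical table and column names (hour/type mirror the partitioning described above):

insert overwrite table target_tbl partition (hour, type)
select col1, col2, hour, type
  from source_tbl
distribute by hour, type;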

8. The <> (not-equal) operator on an int column

Today a colleague wrote a query filtering on type<>'' and id='axxx' and got no rows back, even though type clearly held data; it just happens to be an int column. Rewriting the predicate as cast(type as string) <> '' and id='axxx' returned rows. The reason: when type is an int, Hive coerces the comparison to a numeric one, and an empty string converts to NULL, so type<>'' evaluates to NULL for every row and the filter matches nothing.
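A quick demonstration, runnable in any Hive session (no table needed):

select cast('' as int);           -- NULL: an empty string does not convert to a number
select 1 <> '';                   -- NULL, so a WHERE clause built on it matches nothing
select cast(1 as string) <> '';   -- true, which is why the rewritten predicate works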

9. Zero-byte lzo files in the Hive table's HDFS directory causing read errors

2019-07-11 08:14:00,210 INFO [main] org.apache.hadoop.io.compress.CodecPool: Got brand-new decompressor [.lzo]
2019-07-11 08:14:00,219 WARN [main] org.apache.hadoop.mapred.YarnChild: Exception running child : java.io.IOException: java.io.EOFException: Premature EOF from inputStream
    at org.apache.hadoop.hive.io.HiveIOExceptionHandlerChain.handleRecordReaderCreationException(HiveIOExceptionHandlerChain.java:97)
    at org.apache.hadoop.hive.io.HiveIOExceptionHandlerUtil.handleRecordReaderCreationException(HiveIOExceptionHandlerUtil.java:57)
    at org.apache.hadoop.hive.ql.io.HiveInputFormat.getRecordReader(HiveInputFormat.java:300)
    at org.apache.hadoop.hive.ql.io.CombineHiveInputFormat.getRecordReader(CombineHiveInputFormat.java:706)
    at org.apache.hadoop.mapred.MapTask$TrackedRecordReader.<init>(MapTask.java:169)
    at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:438)
    at org.apache.hadoop.mapred.MapTask.run(MapTask.java:343)
    at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:177)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:422)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1893)
    at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:171)
Caused by: java.io.EOFException: Premature EOF from inputStream
    at com.hadoop.compression.lzo.LzopInputStream.readFully(LzopInputStream.java:74)
    at com.hadoop.compression.lzo.LzopInputStream.readHeader(LzopInputStream.java:115)
    at com.hadoop.compression.lzo.LzopInputStream.<init>(LzopInputStream.java:54)
    at com.hadoop.compression.lzo.LzopCodec.createInputStream(LzopCodec.java:111)
    at com.hadoop.compression.lzo.LzopCodec.createInputStream(LzopCodec.java:149)
    at com.hadoop.mapred.DeprecatedLzoLineRecordReader.<init>(DeprecatedLzoLineRecordReader.java:59)
    at com.hadoop.mapred.DeprecatedLzoTextInputFormat.getRecordReader(DeprecatedLzoTextInputFormat.java:156)
    at org.apache.hadoop.hive.ql.io.HiveInputFormat.getRecordReader(HiveInputFormat.java:297)
    ... 9 more

Solution

Delete the zero-size lzo files from the Hive table's HDFS directory.
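A sketch for finding and removing them with the HDFS CLI (the path is a placeholder; review the file list before piping into rm):

# column 5 of 'hdfs dfs -ls' is the file size; print paths of zero-byte .lzo files
hdfs dfs -ls /path/to/table/day=20190711/ | awk '$5 == 0 && $8 ~ /\.lzo$/ {print $8}'

# after reviewing the list, remove them
hdfs dfs -ls /path/to/table/day=20190711/ | awk '$5 == 0 && $8 ~ /\.lzo$/ {print $8}' | xargs -r hdfs dfs -rm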
