Common operations with Flink operators

Common transformation operations
map and filter

/**
 * Source: a continuous stream of numbers 1 2 3 4 5 ...
 * Use map to print each received element.
 * Use filter to keep only the even numbers.
 */
public class MapDemo {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        DataStreamSource<Long> numberStream = env.addSource(new MyNoParalleSource()).setParallelism(1);
        SingleOutputStreamOperator<Long> dataStream = numberStream.map(new MapFunction<Long, Long>() {
            @Override
            public Long map(Long value) throws Exception {
                System.out.println("received: " + value);
                return value;
            }
        });
        SingleOutputStreamOperator<Long> filterDataStream = dataStream.filter(new FilterFunction<Long>() {
            @Override
            public boolean filter(Long number) throws Exception {
                return number % 2 == 0;
            }
        });
        filterDataStream.print().setParallelism(1);
        env.execute("StreamingDemoWithMyNoPralalleSource");
    }
}

flatMap, keyBy and sum

/**
 * Word count over a sliding window
 * Source: socket
 * Requirement: every 1 second, count word occurrences over the last 2 seconds
 *
 * Operators exercised:
 * flatMap
 * keyBy:
 *    dataStream.keyBy("someKey") // group by the "someKey" field of the objects
 *    dataStream.keyBy(0)         // group by the first element of the Tuple
 * sum
 */
public class WindowWordCountJava {
    public static void main(String[] args) throws Exception {
        int port;
        try {
            ParameterTool parameterTool = ParameterTool.fromArgs(args);
            port = parameterTool.getInt("port");
        } catch (Exception e) {
            System.err.println("no port set, using default port 9988");
            port = 9988;
        }
        // Step 1: get the Flink streaming execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        String hostname = "10.126.88.226";
        String delimiter = "\n";
        // Step 2: get the data source
        DataStreamSource<String> textStream = env.socketTextStream(hostname, port, delimiter);
        // Step 3: apply the transformations
        SingleOutputStreamOperator<WordCount> wordCountStream = textStream.flatMap(new FlatMapFunction<String, WordCount>() {
            public void flatMap(String line, Collector<WordCount> out) throws Exception {
                String[] fields = line.split("\t");
                for (String word : fields) {
                    out.collect(new WordCount(word, 1L));
                }
            }
        }).keyBy("word")
          .timeWindow(Time.seconds(2), Time.seconds(1)) // every 1 second, over the last 2 seconds
          .sum("count");
        wordCountStream.print().setParallelism(1); // print with parallelism 1
        // Step 4: run the program
        env.execute("socket word count");
    }

    public static class WordCount {
        public String word;
        public long count;

        public WordCount() {}

        public WordCount(String word, long count) {
            this.word = word;
            this.count = count;
        }

        @Override
        public String toString() {
            return "WordCount{" + "word='" + word + '\'' + ", count=" + count + '}';
        }
    }
}

union

/**
 * Merges multiple streams; the new stream contains the data of all input streams.
 * One restriction of union: all merged streams must have the same data type.
 */
public class unionDemo {
    public static void main(String[] args) throws Exception {
        // get the Flink execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // get the data sources
        // note: for this source the parallelism can only be set to 1
        DataStreamSource<Long> text1 = env.addSource(new MyNoParalleSource()).setParallelism(1);
        DataStreamSource<Long> text2 = env.addSource(new MyNoParalleSource()).setParallelism(1);
        // merge text1 and text2 into one stream
        DataStream<Long> text = text1.union(text2);
        DataStream<Long> num = text.map(new MapFunction<Long, Long>() {
            @Override
            public Long map(Long value) throws Exception {
                System.out.println("received raw element: " + value);
                return value;
            }
        });
        // process the data every 2 seconds
        DataStream<Long> sum = num.timeWindowAll(Time.seconds(2)).sum(0);
        // print the result
        sum.print().setParallelism(1);
        String jobName = unionDemo.class.getSimpleName();
        env.execute(jobName);
    }
}

connect, CoMap and CoFlatMap

/**
 * Similar to union, but connect can only join two streams. The two streams may have
 * different data types, and a different processing function is applied to each stream.
 */
public class ConnectionDemo {
    public static void main(String[] args) throws Exception {
        // get the Flink execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // get the data sources
        // note: for this source the parallelism can only be set to 1
        DataStreamSource<Long> text1 = env.addSource(new MyNoParalleSource()).setParallelism(1);
        DataStreamSource<Long> text2 = env.addSource(new MyNoParalleSource()).setParallelism(1);
        SingleOutputStreamOperator<String> text2_str = text2.map(new MapFunction<Long, String>() {
            @Override
            public String map(Long value) throws Exception {
                return "str_" + value;
            }
        });
        ConnectedStreams<Long, String> connectStream = text1.connect(text2_str);
        SingleOutputStreamOperator<Object> result = connectStream.map(new CoMapFunction<Long, String, Object>() {
            @Override
            public Object map1(Long value) throws Exception {
                return value;
            }

            @Override
            public Object map2(String value) throws Exception {
                return value;
            }
        });
        // print the result
        result.print().setParallelism(1);
        String jobName = ConnectionDemo.class.getSimpleName();
        env.execute(jobName);
    }
}
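
The heading above also mentions CoFlatMap, which the demo does not exercise. Below is a minimal sketch (not from the original post) that applies a CoFlatMapFunction to the same connectStream built in the demo; unlike CoMapFunction, each input element may emit zero or more results:

// Hedged sketch: a CoFlatMapFunction on the ConnectedStreams<Long, String> from above.
// flatMap1 handles elements of the first stream, flatMap2 the second.
SingleOutputStreamOperator<String> flatResult = connectStream.flatMap(new CoFlatMapFunction<Long, String, String>() {
    @Override
    public void flatMap1(Long value, Collector<String> out) throws Exception {
        // emit only the even numbers, rendered as strings
        if (value % 2 == 0) {
            out.collect(String.valueOf(value));
        }
    }

    @Override
    public void flatMap2(String value, Collector<String> out) throws Exception {
        // emit each string element twice
        out.collect(value);
        out.collect(value);
    }
});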

Split and Select

/**
 * Splits one data stream into multiple streams according to some rule.
 * Use case: in practice, a source stream often mixes several kinds of data that
 * need different processing rules, so the stream can be split into multiple
 * streams and each one handled with its own logic.
 */
public class SplitDemo {
    public static void main(String[] args) throws Exception {
        // get the Flink execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // get the data source
        // note: for this source the parallelism can only be set to 1
        DataStreamSource<Long> text = env.addSource(new MyNoParalleSource()).setParallelism(1);
        // split the stream by whether the number is even or odd
        SplitStream<Long> splitStream = text.split(new OutputSelector<Long>() {
            @Override
            public Iterable<String> select(Long value) {
                ArrayList<String> outPut = new ArrayList<>();
                if (value % 2 == 0) {
                    outPut.add("even");
                } else {
                    outPut.add("odd");
                }
                return outPut;
            }
        });
        // select one or more of the split streams
        DataStream<Long> evenStream = splitStream.select("even");
        DataStream<Long> oddStream = splitStream.select("odd");
        DataStream<Long> moreStream = splitStream.select("odd", "even");
        // print the result
        evenStream.print().setParallelism(1);
        String jobName = SplitDemo.class.getSimpleName();
        env.execute(jobName);
    }
}

Common sink operations
print() / printToErr()
Prints the value of each element's toString() to standard output or to the standard error stream.
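
A one-line usage sketch (stream stands for any existing DataStream):

stream.print();       // each element's toString() goes to standard output
stream.printToErr();  // same, but to the standard error stream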
writeAsText()

/**
 * Source: a continuous stream of numbers 1 2 3 4 5 ...
 * Use map to print each received element.
 * Use filter to keep only the even numbers, then write them out with writeAsText.
 */
public class WriteTextDemo {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        DataStreamSource<Long> numberStream = env.addSource(new MyNoParalleSource()).setParallelism(1);
        SingleOutputStreamOperator<Long> dataStream = numberStream.map(new MapFunction<Long, Long>() {
            @Override
            public Long map(Long value) throws Exception {
                System.out.println("received: " + value);
                return value;
            }
        });
        SingleOutputStreamOperator<Long> filterDataStream = dataStream.filter(new FilterFunction<Long>() {
            @Override
            public boolean filter(Long number) throws Exception {
                return number % 2 == 0;
            }
        });
        filterDataStream.writeAsText("D:\\kkb\\flinklesson\\src\\output\\test").setParallelism(1);
        env.execute("StreamingDemoWithMyNoPralalleSource");
    }
}

Custom sink

<dependency>
    <groupId>org.apache.bahir</groupId>
    <artifactId>flink-connector-redis_2.11</artifactId>
    <version>1.0</version>
</dependency>

Custom Redis sink

/**
 * Writes data into Redis.
 */
public class SinkForRedisDemo {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        DataStreamSource<String> text = env.socketTextStream("hadoop100", 9000, "\n");
        // lpush l_words word
        // wrap each String into a Tuple2<String, String> of (redis key, value)
        DataStream<Tuple2<String, String>> l_wordsData = text.map(new MapFunction<String, Tuple2<String, String>>() {
            @Override
            public Tuple2<String, String> map(String value) throws Exception {
                return new Tuple2<>("l_words", value);
            }
        });
        // create the Redis configuration
        FlinkJedisPoolConfig conf = new FlinkJedisPoolConfig.Builder().setHost("hadoop110").setPort(6379).build();
        // create the Redis sink
        RedisSink<Tuple2<String, String>> redisSink = new RedisSink<>(conf, new MyRedisMapper());
        l_wordsData.addSink(redisSink);
        env.execute("StreamingDemoToRedis");
    }

    public static class MyRedisMapper implements RedisMapper<Tuple2<String, String>> {
        // extract the redis key from the incoming record
        @Override
        public String getKeyFromData(Tuple2<String, String> data) {
            return data.f0;
        }

        // extract the redis value from the incoming record
        @Override
        public String getValueFromData(Tuple2<String, String> data) {
            return data.f1;
        }

        @Override
        public RedisCommandDescription getCommandDescription() {
            return new RedisCommandDescription(RedisCommand.LPUSH);
        }
    }
}
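
The RedisSink above ships with the Bahir connector. When no ready-made connector exists, a fully custom sink can extend RichSinkFunction. Here is a minimal sketch, not from the original post; the class name and method bodies are illustrative, and the invoke(value, context) signature assumes Flink 1.4+:

// Hedged sketch of a custom sink: extend RichSinkFunction and override invoke().
// open()/close() are the right places to acquire and release external connections.
public class MyCustomSink extends RichSinkFunction<Tuple2<String, String>> {
    @Override
    public void open(Configuration parameters) throws Exception {
        super.open(parameters);
        // e.g. open a connection to the external system here (illustrative)
    }

    @Override
    public void invoke(Tuple2<String, String> value, Context context) throws Exception {
        // called once per record; write it to the external system (illustrative)
        System.out.println("sinking: " + value.f0 + " -> " + value.f1);
    }

    @Override
    public void close() throws Exception {
        // release the connection here
        super.close();
    }
}

It would be attached exactly like the Redis sink above: l_wordsData.addSink(new MyCustomSink());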

DataSet operator operations (comparable to Spark Core)

source

File-based
readTextFile(path)
Collection-based
fromCollection(Collection)
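
A minimal sketch of both sources (not from the original post; the file path and contents are illustrative):

import java.util.Arrays;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;

public class SourceDemo {
    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        // file-based: one String element per line (path is illustrative)
        DataSet<String> fromFile = env.readTextFile("hdfs:///tmp/input.txt");
        // collection-based: elements come from an in-memory collection
        DataSet<String> fromList = env.fromCollection(Arrays.asList("hello you", "hello me"));
        fromList.print();
    }
}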

transform

Map: takes one element and returns one element; useful for cleaning and transforming data
FlatMap: takes one element and returns zero, one, or more elements
MapPartition: like map, but processes one partition of data per call [recommended over map when the processing needs a third-party resource such as a connection]
Filter: keeps only the elements for which the predicate returns true
Reduce: aggregates the data by combining the current element with the value returned by the previous reduce call, producing a new value (see the sketch after this list)
Aggregate: built-in aggregations such as sum, max, min (see the sketch after this list)
Distinct: returns the deduplicated elements of a dataset, e.g. data.distinct()
Join: inner join
OuterJoin: outer join

Cross: Cartesian product of two datasets
Union: concatenation of two datasets; the data types must match
First-n: returns the first N elements of a dataset
Sort Partition: sorts all partitions of a dataset locally; chained sortPartition() calls sort on multiple fields
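
Reduce and Aggregate have no dedicated demo below, so here is a minimal sketch of both (not from the original post; sum(n) is the shortcut form of aggregate(Aggregations.SUM, n)):

import java.util.Arrays;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple2;

public class ReduceAggregateDemo {
    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        // Reduce: fold the elements pairwise into a single value (here: a sum)
        DataSet<Integer> numbers = env.fromCollection(Arrays.asList(1, 2, 3, 4, 5));
        numbers.reduce(new ReduceFunction<Integer>() {
            @Override
            public Integer reduce(Integer a, Integer b) throws Exception {
                // combine the running value with the next element
                return a + b;
            }
        }).print(); // 15
        // Aggregate: sum field 1 within each group keyed by field 0
        DataSet<Tuple2<String, Integer>> tuples = env.fromElements(
                new Tuple2<>("a", 1), new Tuple2<>("a", 2), new Tuple2<>("b", 3));
        tuples.groupBy(0).sum(1).print(); // (a,3) and (b,3)
    }
}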

MapPartition
public class MapPartitionDemo {
    public static void main(String[] args) throws Exception {
        // get the execution environment
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        ArrayList<String> data = new ArrayList<>();
        data.add("hello you");
        data.add("hello me");
        DataSource<String> text = env.fromCollection(data);
        /*
        text.map(new MapFunction<String, String>() {
            @Override
            public String map(String value) throws Exception {
                // acquire a database connection -- note: one connection per incoming element
                // process the data
                // close the connection
                return value;
            }
        });
        */
        DataSet<String> mapPartitionData = text.mapPartition(new MapPartitionFunction<String, String>() {
            @Override
            public void mapPartition(Iterable<String> values, Collector<String> out) throws Exception {
                // acquire a database connection -- note: one connection per partition
                // [advantage: far fewer connections than one per element]
                // `values` holds the data of one partition
                Iterator<String> it = values.iterator();
                while (it.hasNext()) {
                    String next = it.next();
                    String[] split = next.split("\\W+");
                    for (String word : split) {
                        out.collect(word);
                    }
                }
                // close the connection
            }
        });
        mapPartitionData.print();
    }
}

distinct

/**
 * Deduplicates the data.
 */
public class DistinctDemo {
    public static void main(String[] args) throws Exception {
        // get the execution environment
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        ArrayList<String> data = new ArrayList<>();
        data.add("you jump");
        data.add("i jump");
        DataSource<String> text = env.fromCollection(data);
        FlatMapOperator<String, String> flatMapData = text.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public void flatMap(String value, Collector<String> out) throws Exception {
                String[] split = value.toLowerCase().split("\\W+");
                for (String word : split) {
                    System.out.println("word: " + word);
                    out.collect(word);
                }
            }
        });
        flatMapData.distinct() // deduplicate across the whole dataset
                   .print();
    }
}

join

/**
 * Joins two datasets.
 */
public class JoinDemo {
    public static void main(String[] args) throws Exception {
        // get the execution environment
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        // Tuple2<user id, user name>
        ArrayList<Tuple2<Integer, String>> data1 = new ArrayList<>();
        data1.add(new Tuple2<>(1, "zs"));
        data1.add(new Tuple2<>(2, "ls"));
        data1.add(new Tuple2<>(3, "ww"));
        // Tuple2<user id, user city>
        ArrayList<Tuple2<Integer, String>> data2 = new ArrayList<>();
        data2.add(new Tuple2<>(1, "beijing"));
        data2.add(new Tuple2<>(2, "shanghai"));
        data2.add(new Tuple2<>(3, "guangzhou"));
        DataSource<Tuple2<Integer, String>> text1 = env.fromCollection(data1);
        DataSource<Tuple2<Integer, String>> text2 = env.fromCollection(data2);
        text1.join(text2)
             .where(0)    // index of the key field in the first dataset
             .equalTo(0)  // index of the key field in the second dataset
             .with(new JoinFunction<Tuple2<Integer, String>, Tuple2<Integer, String>, Tuple3<Integer, String, String>>() {
                 @Override
                 public Tuple3<Integer, String, String> join(Tuple2<Integer, String> first, Tuple2<Integer, String> second)
                         throws Exception {
                     return new Tuple3<>(first.f0, first.f1, second.f1);
                 }
             }).print();
        System.out.println("==================================");
        // Note: using map here produces the same result as the with() call above.
        /*
        text1.join(text2)
             .where(0)    // index of the key field in the first dataset
             .equalTo(0)  // index of the key field in the second dataset
             .map(new MapFunction<Tuple2<Tuple2<Integer, String>, Tuple2<Integer, String>>, Tuple3<Integer, String, String>>() {
                 @Override
                 public Tuple3<Integer, String, String> map(Tuple2<Tuple2<Integer, String>, Tuple2<Integer, String>> value) throws Exception {
                     return new Tuple3<>(value.f0.f0, value.f0.f1, value.f1.f1);
                 }
             }).print();
        */
    }
}

OuterJoin

/**
 * Outer joins:
 *   left outer join
 *   right outer join
 *   full outer join
 */
public class OuterJoinDemo {
    public static void main(String[] args) throws Exception {
        // get the execution environment
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        // Tuple2<user id, user name>
        ArrayList<Tuple2<Integer, String>> data1 = new ArrayList<>();
        data1.add(new Tuple2<>(1, "zs"));
        data1.add(new Tuple2<>(2, "ls"));
        data1.add(new Tuple2<>(3, "ww"));
        // Tuple2<user id, user city>
        ArrayList<Tuple2<Integer, String>> data2 = new ArrayList<>();
        data2.add(new Tuple2<>(1, "beijing"));
        data2.add(new Tuple2<>(2, "shanghai"));
        data2.add(new Tuple2<>(4, "guangzhou"));
        DataSource<Tuple2<Integer, String>> text1 = env.fromCollection(data1);
        DataSource<Tuple2<Integer, String>> text2 = env.fromCollection(data2);
        /**
         * Left outer join
         *
         * Note: the `second` tuple may be null.
         */
        text1.leftOuterJoin(text2)
             .where(0)
             .equalTo(0)
             .with(new JoinFunction<Tuple2<Integer, String>, Tuple2<Integer, String>, Tuple3<Integer, String, String>>() {
                 @Override
                 public Tuple3<Integer, String, String> join(Tuple2<Integer, String> first, Tuple2<Integer, String> second) throws Exception {
                     if (second == null) {
                         return new Tuple3<>(first.f0, first.f1, "null");
                     } else {
                         return new Tuple3<>(first.f0, first.f1, second.f1);
                     }
                 }
             }).print();
        System.out.println("=============================================================================");
        /**
         * Right outer join
         *
         * Note: the `first` tuple may be null.
         */
        text1.rightOuterJoin(text2)
             .where(0)
             .equalTo(0)
             .with(new JoinFunction<Tuple2<Integer, String>, Tuple2<Integer, String>, Tuple3<Integer, String, String>>() {
                 @Override
                 public Tuple3<Integer, String, String> join(Tuple2<Integer, String> first, Tuple2<Integer, String> second) throws Exception {
                     if (first == null) {
                         return new Tuple3<>(second.f0, "null", second.f1);
                     }
                     return new Tuple3<>(first.f0, first.f1, second.f1);
                 }
             }).print();
        System.out.println("=============================================================================");
        /**
         * Full outer join
         *
         * Note: either `first` or `second` may be null.
         */
        text1.fullOuterJoin(text2)
             .where(0)
             .equalTo(0)
             .with(new JoinFunction<Tuple2<Integer, String>, Tuple2<Integer, String>, Tuple3<Integer, String, String>>() {
                 @Override
                 public Tuple3<Integer, String, String> join(Tuple2<Integer, String> first, Tuple2<Integer, String> second) throws Exception {
                     if (first == null) {
                         return new Tuple3<>(second.f0, "null", second.f1);
                     } else if (second == null) {
                         return new Tuple3<>(first.f0, first.f1, "null");
                     } else {
                         return new Tuple3<>(first.f0, first.f1, second.f1);
                     }
                 }
             }).print();
    }
}

Cross

/**
 * Cartesian product.
 */
public class CrossDemo {
    public static void main(String[] args) throws Exception {
        // get the execution environment
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        // user names
        ArrayList<String> data1 = new ArrayList<>();
        data1.add("zs");
        data1.add("ww");
        // user ids
        ArrayList<Integer> data2 = new ArrayList<>();
        data2.add(1);
        data2.add(2);
        DataSource<String> text1 = env.fromCollection(data1);
        DataSource<Integer> text2 = env.fromCollection(data2);
        CrossOperator.DefaultCross<String, Integer> cross = text1.cross(text2);
        cross.print();
    }
}

First-n and SortPartition

/**
 * Top-N.
 */
import java.util.ArrayList;

public class FirstNDemo {
    public static void main(String[] args) throws Exception {
        // get the execution environment
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        ArrayList<Tuple2<Integer, String>> data = new ArrayList<>();
        data.add(new Tuple2<>(2, "zs"));
        data.add(new Tuple2<>(4, "ls"));
        data.add(new Tuple2<>(3, "ww"));
        data.add(new Tuple2<>(1, "xw"));
        data.add(new Tuple2<>(1, "aw"));
        data.add(new Tuple2<>(1, "mw"));
        DataSource<Tuple2<Integer, String>> text = env.fromCollection(data);
        // take the first 3 elements, in insertion order
        text.first(3).print();
        System.out.println("==============================");
        // group by the first field, then take the first 2 elements of each group
        text.groupBy(0).first(2).print();
        System.out.println("==============================");
        // group by the first field, sort within each group by the second field [ascending],
        // then take the first 2 elements of each group
        text.groupBy(0).sortGroup(1, Order.ASCENDING).first(2).print();
        System.out.println("==============================");
        // no grouping: sort globally and take the first 3 elements,
        // ascending on the first field, descending on the second
        text.sortPartition(0, Order.ASCENDING).sortPartition(1, Order.DESCENDING).first(3).print();
    }
}

partition

/**
 * HashPartition
 *
 * RangePartition
 */
public class HashRangePartitionDemo {
    public static void main(String[] args) throws Exception {
        // get the execution environment
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        ArrayList<Tuple2<Integer, String>> data = new ArrayList<>();
        data.add(new Tuple2<>(1, "hello1"));
        data.add(new Tuple2<>(2, "hello2"));
        data.add(new Tuple2<>(2, "hello3"));
        data.add(new Tuple2<>(3, "hello4"));
        data.add(new Tuple2<>(3, "hello5"));
        data.add(new Tuple2<>(3, "hello6"));
        data.add(new Tuple2<>(4, "hello7"));
        data.add(new Tuple2<>(4, "hello8"));
        data.add(new Tuple2<>(4, "hello9"));
        data.add(new Tuple2<>(4, "hello10"));
        data.add(new Tuple2<>(5, "hello11"));
        data.add(new Tuple2<>(5, "hello12"));
        data.add(new Tuple2<>(5, "hello13"));
        data.add(new Tuple2<>(5, "hello14"));
        data.add(new Tuple2<>(5, "hello15"));
        data.add(new Tuple2<>(6, "hello16"));
        data.add(new Tuple2<>(6, "hello17"));
        data.add(new Tuple2<>(6, "hello18"));
        data.add(new Tuple2<>(6, "hello19"));
        data.add(new Tuple2<>(6, "hello20"));
        data.add(new Tuple2<>(6, "hello21"));
        DataSource<Tuple2<Integer, String>> text = env.fromCollection(data);
        /*
        text.partitionByHash(0).mapPartition(new MapPartitionFunction<Tuple2<Integer, String>, Tuple2<Integer, String>>() {
            @Override
            public void mapPartition(Iterable<Tuple2<Integer, String>> values, Collector<Tuple2<Integer, String>> out) throws Exception {
                Iterator<Tuple2<Integer, String>> it = values.iterator();
                while (it.hasNext()) {
                    Tuple2<Integer, String> next = it.next();
                    System.out.println("current thread id: " + Thread.currentThread().getId() + "," + next);
                }
            }
        }).print();
        */
        text.partitionByRange(0).mapPartition(new MapPartitionFunction<Tuple2<Integer, String>, Tuple2<Integer, String>>() {
            @Override
            public void mapPartition(Iterable<Tuple2<Integer, String>> values, Collector<Tuple2<Integer, String>> out) throws Exception {
                Iterator<Tuple2<Integer, String>> it = values.iterator();
                while (it.hasNext()) {
                    Tuple2<Integer, String> next = it.next();
                    System.out.println("current thread id: " + Thread.currentThread().getId() + "," + next);
                }
            }
        }).print();
    }
}

broadcast

/**
 * Broadcast variables.
 */
public class BroadCastDemo {
    public static void main(String[] args) throws Exception {
        // get the execution environment
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        // 1: prepare the data to broadcast
        ArrayList<Tuple2<String, Integer>> broadData = new ArrayList<>();
        broadData.add(new Tuple2<>("zs", 18));
        broadData.add(new Tuple2<>("ls", 20));
        broadData.add(new Tuple2<>("ww", 17));
        DataSet<Tuple2<String, Integer>> tupleData = env.fromCollection(broadData);
        // 1.1: turn the broadcast data into a map; the key is the user name, the value is the age
        DataSet<HashMap<String, Integer>> toBroadcast = tupleData.map(new MapFunction<Tuple2<String, Integer>, HashMap<String, Integer>>() {
            @Override
            public HashMap<String, Integer> map(Tuple2<String, Integer> value) throws Exception {
                HashMap<String, Integer> res = new HashMap<>();
                res.put(value.f0, value.f1);
                return res;
            }
        });
        // source data
        DataSource<String> data = env.fromElements("zs", "ls", "ww");
        // note: a RichMapFunction is required to access the broadcast variable
        DataSet<String> result = data.map(new RichMapFunction<String, String>() {
            List<HashMap<String, Integer>> broadCastMap = new ArrayList<HashMap<String, Integer>>();
            HashMap<String, Integer> allMap = new HashMap<String, Integer>();

            /**
             * This method runs only once, so it is a good place for initialization --
             * which is why the broadcast variable is fetched here.
             */
            @Override
            public void open(Configuration parameters) throws Exception {
                super.open(parameters);
                // 3: fetch the broadcast data
                this.broadCastMap = getRuntimeContext().getBroadcastVariable("broadCastMapName");
                for (HashMap map : broadCastMap) {
                    allMap.putAll(map);
                }
            }

            @Override
            public String map(String value) throws Exception {
                Integer age = allMap.get(value);
                return value + "," + age;
            }
        }).withBroadcastSet(toBroadcast, "broadCastMapName"); // 2: broadcast the dataset
        result.print();
    }
}

counter

/**
 * Accumulator (counter).
 */
public class CounterDemo {
    public static void main(String[] args) throws Exception {
        // get the execution environment
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        DataSource<String> data = env.fromElements("a", "b", "c", "d");
        DataSet<String> result = data.map(new RichMapFunction<String, String>() {
            // 1: create the accumulator
            private IntCounter numLines = new IntCounter();

            @Override
            public void open(Configuration parameters) throws Exception {
                super.open(parameters);
                // 2: register the accumulator
                getRuntimeContext().addAccumulator("num-lines", this.numLines);
            }

            //int sum = 0;
            @Override
            public String map(String value) throws Exception {
                // With parallelism 1 a plain field would suffice, but with higher
                // parallelism a plain sum gives wrong results -- use the accumulator.
                //sum++;
                //System.out.println("sum:" + sum);
                this.numLines.add(1);
                return value;
            }
        }).setParallelism(8);
        // the counter value can only be read from the job execution result after the job finishes
        //result.print();
        result.writeAsText("d:\\data\\mycounter");
        JobExecutionResult jobResult = env.execute("counter");
        // 3: read the accumulator value
        int num = jobResult.getAccumulatorResult("num-lines");
        System.out.println("num:" + num);
    }
}
