插件配置的Java解析逻辑
./bin/chunjun-standalone.sh -job my-examples/sync_mysql2mysql.json -confProp { "chunjun.dirty-data.output-type" : "print" , "chunjun.dirty-data.max-rows" : "1000" , "chunjun.dirty-data.max-collect-failed-rows" : "100" , "chunjun.dirty-data.jdbc.url" : "jdbc:mysql://localhost:3306/tiezhu" , "chunjun.dirty-data.jdbc.username" : "root" , "chunjun.dirty-data.jdbc.password" : "abc123" , "chunjun.dirty-data.jdbc.database" : "tiezhu" , "chunjun.dirty-data.jdbc.table" : "chunjun_dirty_data" ,"chunjun.dirty-data.jdbc.batch-size" : "10" , "chunjun.dirty-data.log.print-interval" : "10" }
脏数据插件的配置信息会在启动时由-confProp传入。传入后,会在Main.java中由configStreamExecutionEnvironment方法处理。
package com. dtstack. chunjun ;
/**
 * Excerpt of the job entry point; only the part that builds the dirty-data configuration is
 * shown.
 */
public class Main {
/**
 * Configures the stream execution environment for a job.
 *
 * <p>As written in this excerpt, the dirty-data configuration is parsed from {@code options}
 * only when {@code config} is {@code null}; {@code env} is not used by the visible code.
 */
private static void configStreamExecutionEnvironment (
StreamExecutionEnvironment env, Options options, SyncConf config) {
if ( config != null ) {
// Empty here; presumably the handling of a non-null config was elided from the excerpt.
} else {
// DirtyConfUtil.parse(Options) reads the -confProp value carried by the options.
DirtyConf dirtyConf = DirtyConfUtil . parse ( options) ;
}
}
}
package com. dtstack. chunjun ;
/**
 * Builds {@link DirtyConf} objects from the {@code chunjun.dirty-data.*} settings that are passed
 * to the job through {@code -confProp}.
 */
public class DirtyConfUtil {

    /** Collector type used when {@link #TYPE_KEY} is not configured. */
    private static final String DEFAULT_TYPE = "default";

    /** Prefix shared by all dirty-data settings; it is stripped from plugin property names. */
    public static final String DIRTY_CONF_PREFIX = "chunjun.dirty-data.";

    /** Key selecting the collector type (for example {@code print} or {@code jdbc}). */
    public static final String TYPE_KEY = "chunjun.dirty-data.output-type";

    /** Key of the dirty-row threshold. */
    public static final String MAX_ROWS_KEY = "chunjun.dirty-data.max-rows";

    /** Key of the threshold for failed collect attempts. */
    public static final String MAX_FAILED_ROWS_KEY = "chunjun.dirty-data.max-collect-failed-rows";

    /** Key of the log print interval. */
    public static final String PRINT_INTERVAL = "chunjun.dirty-data.log.print-interval";

    /** Key under which the local dirty-plugin directory is stored. */
    public static final String DIRTY_DIR = "chunjun.dirty-data.dir";

    /** Directory name appended to the ChunJun dist dir to locate the dirty plugins. */
    public static final String DIRTY_DIR_SUFFIX = "dirty-data-collector";

    /**
     * Creates a {@link DirtyConf} from a flat key/value map.
     *
     * <p>The type defaults to {@code "default"}, and {@code "jdbc"} is rewritten to {@code
     * "mysql"}. Missing thresholds default to {@code 0} and a missing print interval to {@code 1}.
     * Negative thresholds and a non-positive print interval are stored as {@link Long#MAX_VALUE}.
     * Every entry whose lower-cased key starts with {@link #DIRTY_CONF_PREFIX} is copied into the
     * plugin properties with the prefix removed.
     *
     * @param confMap configuration entries; must not be {@code null}
     * @return the populated configuration
     * @throws NumberFormatException if a threshold or the print interval is not a valid long
     */
    public static DirtyConf parseFromMap(Map<String, String> confMap) {
        DirtyConf dirtyConf = new DirtyConf();
        Properties pluginProperties = new Properties();

        String type = String.valueOf(confMap.getOrDefault(TYPE_KEY, DEFAULT_TYPE));
        if (type.equals("jdbc")) {
            type = "mysql";
        }

        long maxConsumed = Long.parseLong(String.valueOf(confMap.getOrDefault(MAX_ROWS_KEY, "0")));
        long maxFailed =
                Long.parseLong(String.valueOf(confMap.getOrDefault(MAX_FAILED_ROWS_KEY, "0")));
        long printRate = Long.parseLong(String.valueOf(confMap.getOrDefault(PRINT_INTERVAL, "1")));

        String pluginDir = MapUtils.getString(confMap, DIRTY_DIR);

        for (Map.Entry<String, String> item : confMap.entrySet()) {
            // Lower-case once and reuse the result for both the prefix test and the stripped name.
            String key = item.getKey().toLowerCase(Locale.ROOT);
            if (key.startsWith(DIRTY_CONF_PREFIX)) {
                // Strip the prefix literally; a regex-based replace would treat its dots as
                // wildcards.
                String pluginKey = key.substring(DIRTY_CONF_PREFIX.length()).trim();
                pluginProperties.put(pluginKey, item.getValue());
            }
        }

        dirtyConf.setType(type);
        dirtyConf.setMaxConsumed(maxConsumed < 0 ? Long.MAX_VALUE : maxConsumed);
        dirtyConf.setMaxFailedConsumed(maxFailed < 0 ? Long.MAX_VALUE : maxFailed);
        dirtyConf.setPrintRate(printRate <= 0 ? Long.MAX_VALUE : printRate);
        dirtyConf.setPluginProperties(pluginProperties);
        dirtyConf.setLocalPluginPath(pluginDir);

        return dirtyConf;
    }

    /**
     * Creates a {@link DirtyConf} from the command-line options.
     *
     * <p>The {@code confProp} value is parsed into properties, and the dirty-plugin directory
     * (dist dir + separator + {@link #DIRTY_DIR_SUFFIX}) is added under {@link #DIRTY_DIR} before
     * delegating to {@link #parse(Properties)}.
     *
     * @param options command-line options of the job
     * @return the populated configuration
     * @throws NoRestartException if anything goes wrong while parsing
     */
    public static DirtyConf parse(Options options) {
        try {
            Properties properties = PropertiesUtil.parseConf(options.getConfProp());
            properties.put(
                    DIRTY_DIR, options.getChunjunDistDir() + File.separator + DIRTY_DIR_SUFFIX);
            return parse(properties);
        } catch (Exception e) {
            throw new NoRestartException(
                    String.format("Parse conf [%s] to DirtyConf failed.", options.getConfProp()),
                    e);
        }
    }

    /**
     * Creates a {@link DirtyConf} from properties by converting them to a map and delegating to
     * {@link #parseFromMap(Map)}.
     *
     * @param properties dirty-data settings
     * @return the populated configuration
     * @throws NoRestartException if the conversion or the parsing fails
     */
    public static DirtyConf parse(Properties properties) {
        try {
            Map<String, String> confMap = Maps.fromProperties(properties);
            return parseFromMap(confMap);
        } catch (Exception e) {
            throw new NoRestartException(
                    String.format(
                            "Parse properties to dirtyConf failed. Properties: %s", properties),
                    e);
        }
    }
}
其中DirtyConfUtil.parse(Options options)从options中获取了我们前面传入的confProp,然后依次调用了DirtyConfUtil.parse(Properties properties)、DirtyConfUtil.parseFromMap(Map<String, String> confMap),完成解析,最终生成了DirtyConf实体对象。
package com. dtstack. chunjun. dirty ;
/**
 * Serializable holder for the dirty-data settings produced by DirtyConfUtil.
 *
 * <p>Only the fields are shown in this excerpt; the setters that DirtyConfUtil calls are not
 * visible here.
 */
public class DirtyConf implements Serializable {
private static final long serialVersionUID = 1L ;
// Dirty-row threshold, filled from chunjun.dirty-data.max-rows.
protected long maxConsumed;
// Threshold for failed collect attempts, filled from chunjun.dirty-data.max-collect-failed-rows.
protected long maxFailedConsumed;
// Collector type, filled from chunjun.dirty-data.output-type ("jdbc" is stored as "mysql").
private String type;
// Log print interval, filled from chunjun.dirty-data.log.print-interval; 1 until set.
private Long printRate = 1L ;
// chunjun.dirty-data.* settings with the prefix removed, handed to the collector plugin.
private Properties pluginProperties = new Properties ( ) ;
// Local directory of the dirty plugins, filled from chunjun.dirty-data.dir.
private String localPluginPath;
}
脏数据收集逻辑
package com. dtstack. chunjun. source. format ;
/**
 * Base input format; this excerpt shows how a record that cannot be read is routed to the
 * dirty-data manager.
 */
public abstract class BaseRichInputFormat extends RichInputFormat<RowData, InputSplit> {

    /**
     * Reads the next record through {@code nextRecordInternal} and updates the read metrics.
     *
     * <p>A {@code ReadRecordException} is not propagated: the offending row is handed to the
     * dirty manager and {@code null} is returned for this call.
     *
     * @param rowData reusable row passed on to {@code nextRecordInternal}
     * @return the row that was read, or {@code null} when nothing was read or the row was dirty
     */
    @Override
    public RowData nextRecord(RowData rowData) {
        // Throttle first when a byte rate limiter is configured.
        if (byteRateLimiter != null) {
            byteRateLimiter.acquire();
        }

        final RowData row;
        try {
            row = nextRecordInternal(rowData);
        } catch (ReadRecordException e) {
            // Dirty record: collect it and skip the metrics below.
            dirtyManager.collect(e.getRowData(), e, null);
            return null;
        }

        if (row == null) {
            return null;
        }

        updateDuration();
        if (numReadCounter != null) {
            numReadCounter.add(1);
        }
        if (bytesReadCounter != null) {
            bytesReadCounter.add(rowSizeCalculator.getObjectSize(row));
        }
        return row;
    }
}
对于每条数据的处理,会调用nextRecordInternal(rowData),不同的数据源会由BaseRichInputFormat的不同子类实现来处理。每条数据的处理过程中都可能出错,所以使用了try catch的方式来捕获,而此处catch中则实现了对脏数据的捕获收集。
package com. dtstack. chunjun. dirty. manager ;
/**
 * Entry point used by readers and writers to report dirty records; this excerpt shows only
 * {@code collect}.
 */
public class DirtyManager implements Serializable {

    /**
     * Records one dirty row: wraps it in a {@code DirtyDataEntry}, offers it to the consumer and
     * then increments the error counter.
     *
     * @param data the row that could not be processed
     * @param cause the failure that made the row dirty
     * @param field name of the offending field, may be {@code null}
     */
    public void collect(Object data, Throwable cause, String field) {
        // Start the consumer on first use.
        if (executor == null) {
            execute();
        }

        final DirtyDataEntry entry = describe(data, cause, field);

        // Offer before counting, so a rejected offer does not bump the error counter.
        consumer.offer(entry);
        errorCounter.add(1L);
    }

    /** Builds the entry describing one dirty row, stamped with the current time. */
    private DirtyDataEntry describe(Object data, Throwable cause, String field) {
        final DirtyDataEntry entry = new DirtyDataEntry();
        entry.setJobId(jobId);
        entry.setJobName(jobName);
        entry.setOperatorName(operationName);

        final Timestamp now = new Timestamp(System.currentTimeMillis());
        entry.setCreateTime(now);

        entry.setDirtyContent(toString(data));
        entry.setFieldName(field);
        entry.setErrorMessage(ExceptionUtil.getErrorMessage(cause));
        return entry;
    }
}
此处的DirtyDataEntry是用于记录脏数据信息的实体对象,consumer.offer(entity)将脏数据对象提交到了队列,errorCounter.add(1L)记录了脏数据的条数。再看consumer对应的class DirtyDataCollector:
package com. dtstack. chunjun. dirty. consumer ;
/**
 * Base class of the dirty-data collectors: producers hand entries over through {@link
 * #offer(DirtyDataEntry)}, and {@link #run()} drains the queue and passes each entry to {@code
 * consume}.
 */
public abstract class DirtyDataCollector implements Runnable, Serializable {

    /** Counts the consume attempts that ended in an exception. */
    protected final LongCounter failedConsumedCounter = new LongCounter(0L);

    /** Counts the entries that were offered to this collector. */
    protected final LongCounter consumedCounter = new LongCounter(0L);

    /** Hand-over queue between the offering threads and the consuming thread. */
    protected LinkedBlockingQueue<DirtyDataEntry> consumeQueue = new LinkedBlockingQueue<>();

    /**
     * Queues one dirty entry and counts it.
     *
     * @param dirty the entry to queue
     * @throws NoRestartException if the number of offered entries has reached {@code maxConsumed}
     */
    public synchronized void offer(DirtyDataEntry dirty) {
        consumeQueue.offer(dirty);
        addConsumed(1L);
    }

    /**
     * Takes entries from the queue and consumes them while the collector is running.
     *
     * <p>A failure in {@code consume} is counted through {@link #addFailedConsumed(Throwable,
     * long)}. An interrupt is not counted as a failure: the interrupt flag is restored and the loop
     * ends.
     */
    @Override
    public void run() {
        while (isRunning.get()) {
            try {
                DirtyDataEntry dirty = consumeQueue.take();
                consume(dirty);
            } catch (InterruptedException e) {
                // Keep the interrupt visible to the owner of this thread and stop consuming.
                Thread.currentThread().interrupt();
                break;
            } catch (Exception e) {
                addFailedConsumed(e, 1L);
            }
        }
    }

    /**
     * Adds to the offered-entry counter and enforces the {@code maxConsumed} limit.
     *
     * @param count number of entries to add
     * @throws NoRestartException if the local counter value is greater than or equal to {@code
     *     maxConsumed} after the addition
     */
    protected void addConsumed(long count) {
        consumedCounter.add(count);
        if (consumedCounter.getLocalValue() >= maxConsumed) {
            throw new NoRestartException(
                    String.format(
                            "The dirty consumer shutdown, due to the consumed count exceed the max-consumed [%s]",
                            maxConsumed));
        }
    }

    /**
     * Adds to the failed-consume counter, logs a warning and enforces the {@code
     * maxFailedConsumed} limit.
     *
     * @param cause the exception raised while consuming
     * @param failedCount number of failures to add
     * @throws NoRestartException if the local counter value is greater than or equal to {@code
     *     maxFailedConsumed} after the addition
     */
    protected void addFailedConsumed(Throwable cause, long failedCount) {
        failedConsumedCounter.add(failedCount);
        warn(
                LOG,
                "dirty-plugins consume failed.",
                cause,
                printRate,
                failedConsumedCounter.getLocalValue());

        if (failedConsumedCounter.getLocalValue() >= maxFailedConsumed) {
            throw new NoRestartException(
                    String.format(
                            "The dirty consumer shutdown, due to the failed-consumed count exceed the max-failed-consumed [%s]",
                            maxFailedConsumed));
        }
    }
}
当前面调用consumer.offer(entity)时,脏数据对象被提交进了此处的队列consumeQueue。而DirtyDataCollector实现了Runnable接口,里面有个run方法,该方法内是个while循环,会不断地读取consumeQueue队列中的数据做处理。
关于DirtyDataCollector的启动逻辑,可以看后文。前面调用offer时,同时还调用了addConsumed(1L),此时会将Flink的累加器consumedCounter加1,并判断是否达到了最大值maxConsumed(即大于等于;maxConsumed即是我们传入的配置chunjun.dirty-data.max-rows)。
Flink的累加器只有在整个应用执行结束后,合并所有节点的累加器的和才是最终的结果(consumedCounter.getLocalValue()只能获取节点本地的累加器的值)。因此,ChunJun此处的实现在1个channel下没问题;当出现多个channel时,每个channel是单独计算的。写入Sink端和读取Source端的脏数据处理逻辑一致,不再赘述。
脏数据记录写入MySQL
接着再看DirtyDataCollector中的run
/**
 * Takes entries from the queue and consumes them while the collector is running.
 *
 * <p>A failure in {@code consume} is counted through {@code addFailedConsumed}. An interrupt is
 * not counted as a failure: the interrupt flag is restored and the loop ends.
 */
@Override
public void run() {
    while (isRunning.get()) {
        try {
            DirtyDataEntry dirty = consumeQueue.take();
            consume(dirty);
        } catch (InterruptedException e) {
            // Keep the interrupt visible to the owner of this thread and stop consuming.
            Thread.currentThread().interrupt();
            break;
        } catch (Exception e) {
            addFailedConsumed(e, 1L);
        }
    }
}
先从队列中取出数据,再调用consume,我们来看其对应MySQL的实现
package com. dtstack. chunjun. dirty. mysql ;
/** Dirty-data collector that buffers entries and writes them to MySQL as a JDBC batch. */
public class MysqlDirtyDataCollector extends DirtyDataCollector {

    /**
     * Writes all buffered entries as one JDBC batch. When the batch raises a {@code
     * SQLException}, {@code singleFlush()} is called instead. The buffer is cleared afterwards in
     * either case.
     */
    private void flush() {
        try {
            for (DirtyDataEntry buffered : entities) {
                final String[] columnValues = buffered.toArray();
                // JDBC parameters are 1-based while the value array is 0-based.
                for (int parameterIndex = 1;
                        parameterIndex <= TABLE_FIELDS.length;
                        parameterIndex++) {
                    statement.setObject(parameterIndex, columnValues[parameterIndex - 1]);
                }
                statement.addBatch();
            }
            statement.executeBatch();
        } catch (SQLException e) {
            singleFlush();
        } finally {
            entities.clear();
        }
    }

    /**
     * Buffers one dirty entry and flushes the buffer when the consumed counter is a multiple of
     * the batch size.
     *
     * @param dirty the entry to buffer
     * @throws Exception declared by the overridden method
     */
    @Override
    protected void consume(DirtyDataEntry dirty) throws Exception {
        entities.add(dirty);

        // NOTE(review): consumedCounter is advanced by offer() on the caller's thread, not here,
        // so this check may skip over multiples of batchSize and leave entries buffered.
        // Consider deciding on the buffer size (and elapsed time) instead.
        final boolean onBatchBoundary = consumedCounter.getLocalValue() % batchSize == 0;
        if (onBatchBoundary) {
            flush();
        }
    }
}
此处先将脏数据对象放入了entities列表中,然后看Flink累加器consumedCounter中记录的值是否是batchSize的整数倍(batchSize对应传入的配置chunjun.dirty-data.jdbc.batch-size),如果是,那么调用flush(),利用JDBC的批量写入机制,将数据入库到配置的MySQL中
此处取余的逻辑比较奇怪(即consumedCounter.getLocalValue() % batchSize == 0),那么是不是只要consumedCounter的值不为batchSize的整数倍,就永远不会执行flush()逻辑呢?因为此处判断的逻辑和consumedCounter累加的逻辑不在同一线程,所以记录数只会在恰巧的情况下达到batchSize的整数倍且被处理(当脏数据较多时,entities中可能会堆积大量数据,而数据库里也一直看不到记录)。建议:根据entities的大小和时间间隔来决定是否应该调用flush()。而DirtyDataCollector中的run中还有一个try catch,当前面记录脏数据失败时,则会调用addFailedConsumed(e, 1L):
package com. dtstack. chunjun. dirty. consumer ;
/**
 * Base class of the dirty-data collectors; this excerpt shows only the bookkeeping for failed
 * consume attempts.
 */
public abstract class DirtyDataCollector implements Runnable, Serializable {

    /** Counts the consume attempts that ended in an exception. */
    protected final LongCounter failedConsumedCounter = new LongCounter(0L);

    /**
     * Adds to the failed-consume counter, logs a warning and enforces the {@code
     * maxFailedConsumed} limit.
     *
     * @param cause the exception raised while consuming
     * @param failedCount number of failures to add
     * @throws NoRestartException if the local counter value is greater than or equal to {@code
     *     maxFailedConsumed} after the addition
     */
    protected void addFailedConsumed(Throwable cause, long failedCount) {
        failedConsumedCounter.add(failedCount);

        warn(
                LOG,
                "dirty-plugins consume failed.",
                cause,
                printRate,
                failedConsumedCounter.getLocalValue());

        // Still under the limit: nothing more to do.
        if (failedConsumedCounter.getLocalValue() < maxFailedConsumed) {
            return;
        }

        final String reason =
                String.format(
                        "The dirty consumer shutdown, due to the failed-consumed count exceed the max-failed-consumed [%s]",
                        maxFailedConsumed);
        throw new NoRestartException(reason);
    }
}
此时将另一个Flink累加器failedConsumedCounter加1,并判断是否达到了最大值maxFailedConsumed(即大于等于;maxFailedConsumed对应传入的配置chunjun.dirty-data.max-collect-failed-rows)。
需要注意的是,每次记录脏数据到MySQL,是批量写入的,而每次写入异常时累加器只会加1,所以配置chunjun.dirty-data.max-collect-failed-rows对应的是脏数据写入次数,而不是条数
errorLimit配置不生效
"setting" : {
"speed" : {
"channel" : 1 ,
"bytes" : 0
} ,
"errorLimit" : {
"record" : 5 ,
"percentage" : 60.0
}
}
package com. dtstack. chunjun. conf ;
/**
 * Serializable holder for the sub-sections of a job's "setting" block.
 *
 * <p>NOTE(review): no field for an "errorLimit" section appears in this excerpt. If the full
 * class has none either, an "errorLimit" object under "setting" would have nothing to bind to.
 * Confirm against the complete source.
 */
public class SettingConf implements Serializable {
private static final long serialVersionUID = 1L ;
// Speed settings; presumably bound from the "speed" object of the job JSON - verify.
private SpeedConf speed = new SpeedConf ( ) ;
// Metric plugin settings.
private MetricPluginConf metricPluginConf = new MetricPluginConf ( ) ;
// Restore settings.
private RestoreConf restore = new RestoreConf ( ) ;
// Restart settings.
private RestartConf restart = new RestartConf ( ) ;
// Log settings.
private LogConf log = new LogConf ( ) ;
}
DirtyDataCollector的实例化、运行逻辑
在BaseRichInputFormat中处理数据源的数据发生异常时,会调用dirtyManager.collect(e.getRowData(), e, null)。collect方法中最开始调用了execute():
package com. dtstack. chunjun. dirty. manager ;
public class DirtyManager implements Serializable {
private transient ThreadPoolExecutor executor;
public DirtyManager ( DirtyConf dirtyConf, RuntimeContext runtimeContext) {
this . consumer = DataSyncFactoryUtil . discoverDirty ( dirtyConf) ;
}
public void collect ( Object data, Throwable cause, String field) {
if ( executor == null ) {
execute ( ) ;
}
}
public void execute ( ) {
if ( executor == null ) {
executor =
new ThreadPoolExecutor (
MAX_THREAD_POOL_SIZE,
MAX_THREAD_POOL_SIZE,
0 ,
TimeUnit . MILLISECONDS,
new LinkedBlockingQueue < > ( ) ,
new ChunJunThreadFactory (
"dirty-consumer" ,
true ,
( t, e) -> {
LOG. error (
String . format (
"Thread [%s] consume failed." , t. getName ( ) ) ,
e) ;
} ) ,
new ThreadPoolExecutor. CallerRunsPolicy ( ) ) ;
}
consumer. open ( ) ;
executor. execute ( consumer) ;
}
}
此处构建了一个线程池executor,并对consumer进行了执行(记得前面说这个consumer实现了Runnable接口吧)。而consumer的实例化,则是在DirtyManager的构造器中,调用了DataSyncFactoryUtil.discoverDirty(dirtyConf),利用反射获得脏数据插件对应的Class,并实例化(和connector插件几乎一致):
package com. dtstack. chunjun. util ;
/** Factory helpers for plugins; this excerpt shows the lookup of the dirty-data collector. */
public class DataSyncFactoryUtil {

    /**
     * Instantiates and initializes the dirty-data collector selected by {@code conf}.
     *
     * <p>The class name is resolved from the configured type, except that the default type maps
     * to the built-in default class. The class is loaded through the current thread's context
     * class loader and created through its public no-argument constructor.
     *
     * @param conf dirty-data configuration
     * @return the initialized collector
     * @throws NoRestartException if the class cannot be resolved, loaded, created or initialized
     */
    public static DirtyDataCollector discoverDirty(DirtyConf conf) {
        try {
            final String type = conf.getType();
            final String resolvedName = PluginUtil.getPluginClassName(type, OperatorType.dirty);
            final String className =
                    type.equals(DEFAULT_DIRTY_TYPE) ? DEFAULT_DIRTY_CLASS : resolvedName;

            final Object instance =
                    Thread.currentThread()
                            .getContextClassLoader()
                            .loadClass(className)
                            .getConstructor()
                            .newInstance();

            final DirtyDataCollector collector = (DirtyDataCollector) instance;
            collector.initializeConsumer(conf);
            return collector;
        } catch (Exception e) {
            throw new NoRestartException("Load dirty plugins failed!", e);
        }
    }
}