关于docker部署的clickhouse同步数据方案

一只会飞的鱼儿 6天前 ⋅ 15 阅读
ad

脚本如下:

#!/bin/bash
# sync_clickhouse_final.sh
# Sync every table from webfunny_cloud_db to webfunny_db_test2 inside a
# Dockerized ClickHouse container.

echo "ClickHouse数据库同步 - 最终版本"
echo "====================================="

# Locate the ClickHouse container. Take only the first match: if several
# containers match the name filter, $CONTAINER would otherwise become a
# multi-line value and break every later `docker exec`.
CONTAINER=$(docker ps --filter "name=clickhouse" --format "{{.ID}}" | head -n 1)
if [ -z "$CONTAINER" ]; then
    echo "找不到ClickHouse容器"
    exit 1
fi

echo "容器ID: $CONTAINER"
echo "从: webfunny_cloud_db 到: webfunny_db_test2"
echo ""

# 1. Create the target database (idempotent).
echo "1. 创建目标数据库..."
docker exec "$CONTAINER" clickhouse-client -q "CREATE DATABASE IF NOT EXISTS webfunny_db_test2"

# 2. List source tables; TOTAL is the number of lines in the list.
echo "2. 获取表列表..."
TABLES=$(docker exec "$CONTAINER" clickhouse-client -q "SHOW TABLES FROM webfunny_cloud_db")
TOTAL=$(echo "$TABLES" | wc -l)
echo "找到 $TOTAL 个表"
echo ""

# 3. Sync one table: recreate its schema in the target DB, then copy rows.
# Arguments: $1 - table name.
# Returns:   0 on success (or when the table is empty), 1 on failure.
sync_table() {
    local table=$1

    echo "处理表: $table"

    # Fetch the original CREATE TABLE statement.
    local create_sql
    create_sql=$(docker exec "$CONTAINER" clickhouse-client -q "SHOW CREATE TABLE webfunny_cloud_db.\`$table\`")

    if [ -z "$create_sql" ]; then
        echo "无法获取表结构"
        return 1
    fi

    # clickhouse-client returns the DDL with literal "\n" escapes; expand them.
    create_sql=$(echo -e "$create_sql")

    # Point the DDL at the target database.
    create_sql=${create_sql//webfunny_cloud_db\./webfunny_db_test2.}

    # Create the table in the target database.
    if ! echo "$create_sql" | docker exec -i "$CONTAINER" clickhouse-client; then
        echo "创建表失败"
        return 1
    fi
    echo "表结构创建成功"

    # Count source rows. Sanitize to a number so the -eq test below cannot
    # blow up if the client prints an error message instead of a count.
    echo "  检查数据量..."
    local row_count
    row_count=$(docker exec "$CONTAINER" clickhouse-client -q "SELECT count() FROM webfunny_cloud_db.\`$table\`")
    case "$row_count" in
        ''|*[!0-9]*) row_count=0 ;;
    esac

    if [ "$row_count" -eq 0 ]; then
        echo "表为空,跳过数据同步"
        return 0
    fi

    echo "数据行数: $row_count"

    # Copy the data (falls back to batched inserts internally).
    echo "  同步数据..."
    sync_table_data "$table" "$row_count"
}

# Copy all rows of one table; fall back to batched inserts when the
# full-table INSERT fails (e.g. TOO_MANY_PARTS).
# Arguments: $1 - table name, $2 - total row count.
sync_table_data() {
    local table=$1
    local total_rows=$2

    # NOTE: a standalone `clickhouse-client -q "SET ..."` only affects that
    # one client session and is gone by the next `docker exec`, so the
    # setting is passed directly on the INSERT query instead.
    echo "  执行数据插入..."
    if docker exec "$CONTAINER" clickhouse-client \
        --max_partitions_per_insert_block=1000000 -q "
        INSERT INTO webfunny_db_test2.\`$table\`
        SELECT * FROM webfunny_cloud_db.\`$table\`
    "; then
        echo "数据同步成功"
        return 0
    fi

    echo "全量插入失败,尝试分批..."

    # Prefer date-based batching when the table has a usable time column.
    local has_createdat
    has_createdat=$(docker exec "$CONTAINER" clickhouse-client -q "
        SELECT count()
        FROM system.columns
        WHERE database = 'webfunny_cloud_db'
          AND table = '$table'
          AND (name = 'createdAt' OR name = 'happenTime')
    " 2>/dev/null)

    # Default to 0 so the numeric test cannot error on an empty result.
    if [ "${has_createdat:-0}" -gt 0 ] 2>/dev/null; then
        echo "  使用createdAt字段分批..."
        sync_by_createdat "$table"
    else
        echo "  使用LIMIT分批..."
        sync_by_limit "$table" "$total_rows"
    fi
}

# Batched copy, one day at a time, keyed on createdAt (with a fallback to
# happenTime for tables that use that column instead).
# Arguments: $1 - table name.
# Returns:   0 if at least one day synced successfully, 1 otherwise.
sync_by_createdat() {
    local table=$1

    # Date range of the source data (client prints TSV: "min<TAB>max").
    echo "  获取日期范围..."
    local date_range
    date_range=$(docker exec "$CONTAINER" clickhouse-client -q "
        SELECT
            toDate(min(createdAt)) as min_date,
            toDate(max(createdAt)) as max_date
        FROM webfunny_cloud_db.\`$table\`
    " 2>/dev/null)

    if [ $? -ne 0 ]; then
        # No createdAt column - try happenTime instead.
        date_range=$(docker exec "$CONTAINER" clickhouse-client -q "
            SELECT
                toDate(min(happenTime)) as min_date,
                toDate(max(happenTime)) as max_date
            FROM webfunny_cloud_db.\`$table\`
        " 2>/dev/null)
    fi

    local min_date max_date
    min_date=$(echo "$date_range" | awk 'NR==1{print $1}' 2>/dev/null)
    max_date=$(echo "$date_range" | awk 'NR==1{print $2}' 2>/dev/null)

    if [ -z "$min_date" ] || [ "$min_date" = "0000-00-00" ] || [ "$min_date" = "NULL" ]; then
        echo "无法获取日期范围,使用LIMIT分批"
        return 1
    fi

    echo "  日期范围: $min_date 到 $max_date"

    # Walk the range one day at a time.
    local current_date="$min_date"
    local batch_count=0
    local success_count=0

    while [ "$(date -d "$current_date" +%s 2>/dev/null || echo 0)" -le "$(date -d "$max_date" +%s 2>/dev/null || echo 0)" ]; do
        ((batch_count++))
        echo "  批次 $batch_count: $current_date"

        # Row count for this day; default to 0 so the numeric tests below
        # cannot error out on an empty string.
        local day_count
        day_count=$(docker exec "$CONTAINER" clickhouse-client -q "
            SELECT count()
            FROM webfunny_cloud_db.\`$table\`
            WHERE toDate(createdAt) = '$current_date'
        " 2>/dev/null)
        day_count=${day_count:-0}

        if [ "$day_count" -eq 0 ] 2>/dev/null; then
            # Try happenTime when createdAt yields nothing.
            day_count=$(docker exec "$CONTAINER" clickhouse-client -q "
                SELECT count()
                FROM webfunny_cloud_db.\`$table\`
                WHERE toDate(happenTime) = '$current_date'
            " 2>/dev/null)
            day_count=${day_count:-0}
        fi

        if [ "$day_count" -gt 0 ] 2>/dev/null; then
            echo "    当天数据量: $day_count 行"

            # The partition limit is applied per query: a standalone
            # `clickhouse-client -q "SET ..."` would not outlive its own
            # session. Try createdAt first, then fall back to happenTime.
            if docker exec "$CONTAINER" clickhouse-client \
                --max_partitions_per_insert_block=1000000 -q "
                INSERT INTO webfunny_db_test2.\`$table\`
                SELECT * FROM webfunny_cloud_db.\`$table\`
                WHERE toDate(createdAt) = '$current_date'
            " 2>&1 \
            || docker exec "$CONTAINER" clickhouse-client \
                --max_partitions_per_insert_block=1000000 -q "
                INSERT INTO webfunny_db_test2.\`$table\`
                SELECT * FROM webfunny_cloud_db.\`$table\`
                WHERE toDate(happenTime) = '$current_date'
            " 2>&1; then
                ((success_count++))
                echo "批次 $current_date 同步成功"
            else
                echo "批次 $current_date 同步失败"
            fi
        else
            echo "    当天无数据,跳过"
        fi

        # Advance one day; bail out if date arithmetic fails.
        current_date=$(date -d "$current_date + 1 day" +%Y-%m-%d 2>/dev/null) || break
    done

    if [ "$success_count" -gt 0 ]; then
        echo "按日期分批同步完成 ($success_count 天数据成功)"
        return 0
    else
        return 1
    fi
}

# Batched copy via LIMIT/OFFSET for tables without a time column.
# Arguments: $1 - table name, $2 - total row count.
# Returns:   0 if at least one batch succeeded, 1 otherwise.
# WARNING(review): LIMIT/OFFSET without ORDER BY is not a guaranteed stable
# ordering in ClickHouse, so batches may overlap or miss rows; add an
# ORDER BY on the table's sorting key if exactness is required.
sync_by_limit() {
    local table=$1
    local total_rows=$2

    local batch_size=50000
    local offset=0
    local batch_count=0
    local success_count=0

    echo "  使用LIMIT OFFSET分批,批次大小: $batch_size"

    while [ "$offset" -lt "$total_rows" ]; do
        ((batch_count++))
        echo "  批次 $batch_count: 行 $((offset+1)) - $((offset+batch_size))"

        # The partition limit is passed per query: a standalone
        # `clickhouse-client -q "SET ..."` would not outlive its session.
        if docker exec "$CONTAINER" clickhouse-client \
            --max_partitions_per_insert_block=1000000 -q "
            INSERT INTO webfunny_db_test2.\`$table\`
            SELECT * FROM webfunny_cloud_db.\`$table\`
            LIMIT $batch_size OFFSET $offset
        " 2>&1; then
            ((success_count++))
            echo "批次 $batch_count 同步成功"
        else
            echo "批次 $batch_count 同步失败"

            # Retry the same offset once with a smaller batch size.
            if [ "$batch_size" -gt 10000 ]; then
                echo "    尝试更小的批次大小: 10000"
                batch_size=10000
                continue
            fi
        fi

        offset=$((offset + batch_size))
    done

    if [ "$success_count" -gt 0 ]; then
        echo "分批同步完成 ($success_count/$batch_count 批成功)"
        return 0
    else
        return 1
    fi
}

# 4. Handle the problematic BuryPointFailLog table first, on its own:
# it holds too many partitions for a single INSERT, so copy it month by
# month, falling back to day-by-day batching.
echo "3. 专门处理BuryPointFailLog表..."
echo ""

TABLE="BuryPointFailLog"
if echo "$TABLES" | grep -q "^${TABLE}$"; then
    echo "处理表: $TABLE"

    # Recreate the schema in the target database (expand "\n" escapes,
    # then rewrite the database prefix).
    create_sql=$(docker exec "$CONTAINER" clickhouse-client -q "SHOW CREATE TABLE webfunny_cloud_db.\`$TABLE\`")
    create_sql=$(echo -e "$create_sql")
    create_sql=${create_sql//webfunny_cloud_db\./webfunny_db_test2.}
    echo "$create_sql" | docker exec -i "$CONTAINER" clickhouse-client

    if [ $? -eq 0 ]; then
        echo "表结构创建成功"

        row_count=$(docker exec "$CONTAINER" clickhouse-client -q "SELECT count() FROM webfunny_cloud_db.\`$TABLE\`")
        echo "数据行数: $row_count"

        echo "  使用优化方法同步..."
        echo "  尝试按月分批..."

        # Distinct months present in the data, oldest first.
        months=$(docker exec "$CONTAINER" clickhouse-client -q "
            SELECT DISTINCT toYYYYMM(createdAt) as month
            FROM webfunny_cloud_db.\`$TABLE\`
            ORDER BY month
        " 2>/dev/null)

        if [ -n "$months" ]; then
            success=0
            for month in $months; do
                echo "  同步月份: $month"

                # Pass the partition limit per query; a standalone SET in
                # its own clickhouse-client session would have no effect.
                docker exec "$CONTAINER" clickhouse-client \
                    --max_partitions_per_insert_block=1000000 -q "
                    INSERT INTO webfunny_db_test2.\`$TABLE\`
                    SELECT * FROM webfunny_cloud_db.\`$TABLE\`
                    WHERE toYYYYMM(createdAt) = $month
                " 2>&1

                if [ $? -eq 0 ]; then
                    ((success++))
                    echo "月份 $month 同步成功"
                else
                    echo "月份 $month 同步失败"
                fi
            done

            if [ $success -gt 0 ]; then
                echo "按月分批同步完成 ($success 个月成功)"
            else
                echo "按月分批失败,尝试按天分批..."
                sync_by_createdat "$TABLE"
            fi
        else
            echo "无法获取月份列表,使用按天分批..."
            sync_by_createdat "$TABLE"
        fi
    else
        echo "创建表失败"
    fi
else
    echo "表 $TABLE 不存在"
fi

echo ""
echo "测试完成,是否继续同步所有表?(y/N)"
read -r CONTINUE

# Anything other than y/Y aborts the full sync.
case "$CONTINUE" in
    y|Y) ;;
    *)
        echo "退出"
        exit 0
        ;;
esac

# 5. Sync every remaining table.
echo ""
echo "4. 开始同步剩余表..."
echo ""

COUNT=1
PROCESSED=0
SUCCESS=0
FAIL=0
FAILED_TABLES=()

for TABLE in $TABLES; do
    # BuryPointFailLog was already handled separately above.
    [ "$TABLE" = "BuryPointFailLog" ] && continue

    echo "[$COUNT/$TOTAL] 同步: $TABLE"
    PROCESSED=$((PROCESSED + 1))

    if sync_table "$TABLE"; then
        SUCCESS=$((SUCCESS + 1))
    else
        FAIL=$((FAIL + 1))
        FAILED_TABLES+=("$TABLE")
    fi

    echo ""
    COUNT=$((COUNT + 1))
done

# 6. Final report.
echo "====================================="
echo "同步结果"
echo "总表数: $TOTAL"
echo "已处理: $PROCESSED"
echo "成功: $SUCCESS"
echo "失败: $FAIL"
echo ""

if [ "$FAIL" -gt 0 ]; then
    echo "失败的表 ($FAIL 个):"
    for failed_table in "${FAILED_TABLES[@]}"; do
        echo "  - $failed_table"
    done
else
    echo "所有表同步成功!"
fi

echo ""
echo "同步完成!时间: $(date)"

 

目前遇到的问题是

1、分区太多问题:

Received exception from server (version 24.3.2):
Code: 252. DB::Exception: Received from localhost:9000. DB::Exception: Too many partitions for single INSERT block (more than 100). The limit is controlled by 'max_partitions_per_insert_block' setting. Large number of partitions is a common misconception. It will lead to severe negative performance impact, including slow server startup, slow INSERT queries and slow SELECT queries. Recommended total number of partitions for a table is under 1000..10000. Please note, that partitioning is not intended to speed up SELECT queries (ORDER BY key is sufficient to make range queries fast). Partitions are intended for data manipulation (DROP PARTITION, etc).. (TOO_MANY_PARTS)

处理方式是分批次插入数据

2、问题在于SET语句的作用域与语法:单独执行 `clickhouse-client -q "SET ..."` 只对该次客户端会话生效,下一次调用时设置已经失效;而在同一个 -q 字符串里写多条语句(如 SET ...; INSERT ...)在旧版本客户端需要 --multiquery 才能执行。更可靠的做法是把设置作为客户端参数传入,例如 `clickhouse-client --max_partitions_per_insert_block=1000000 -q "INSERT ..."`

# Batched copy via LIMIT/OFFSET (corrected version).
# Arguments: $1 - table name, $2 - total row count.
# Returns:   0 if at least one batch succeeded, 1 otherwise.
sync_by_limit() {
    local table=$1
    local total_rows=$2

    local batch_size=100000
    local offset=0
    local batch_count=0
    local success_count=0

    echo "  使用LIMIT OFFSET分批,批次大小: $batch_size"

    while [ "$offset" -lt "$total_rows" ]; do
        ((batch_count++))
        echo "  批次 $batch_count: 行 $((offset+1)) - $((offset+batch_size))"

        # Pass the setting as a client option rather than "SET ...; INSERT":
        # a multi-statement -q string needs --multiquery on older clients,
        # while a per-query setting works everywhere and is correctly scoped.
        docker exec "$CONTAINER" clickhouse-client \
            --max_partitions_per_insert_block=1000000 -q "
            INSERT INTO webfunny_db_test2.\`$table\`
            SELECT * FROM webfunny_cloud_db.\`$table\`
            LIMIT $batch_size OFFSET $offset
        " 2>&1

        if [ $? -eq 0 ]; then
            ((success_count++))
            echo "    批次 $batch_count 同步成功"
        else
            echo " 批次 $batch_count 同步失败"
        fi

        offset=$((offset + batch_size))
    done

    if [ $success_count -gt 0 ]; then
        echo " 分批同步完成 ($success_count/$batch_count 批成功)"
        return 0
    else
        return 1
    fi
}

 

 

 

 

 

关于Webfunny

Webfunny专注于前端监控系统,前端埋点系统的研发。 致力于帮助开发者快速定位问题,帮助企业用数据驱动业务,实现业务数据的快速增长。支持H5/Web/PC前端、微信小程序、支付宝小程序、UniApp和Taro等跨平台框架。实时监控前端网页、前端数据分析、错误统计分析监控和BUG预警,第一时间报警,快速修复BUG!支持私有化部署,Docker容器化部署,可支持千万级PV的日活量!

  点赞 0   收藏 0
  • 一只会飞的鱼儿
    共发布61篇文章 获得8个收藏
全部评论: 0