Clickhouse 配置参考

Clickhouse 配置参考

适用版本 21.3.9.84

config.xml 配置

xml 复制代码
<?xml version="1.0"?>
<!--
  NOTE: User and query level settings are set up in "users.xml" file.
-->
<yandex>
    <access_control_path>/data/clickhouse/clickhouse-server/access/</access_control_path>
    <!-- Server logging: verbosity level, main and error log file locations, and the size/count rotation settings. -->
    <logger>
        <!-- Possible levels: https://github.com/pocoproject/poco/blob/poco-1.9.4-release/Foundation/include/Poco/Logger.h#L105 -->
        <!-- NOTE(review): "trace" is presumably the most verbose of the levels linked above; confirm it is wanted outside of debugging. -->
        <level>trace</level>
        <log>/data/clickhouse/clickhouse-server/logs/clickhouse-server.log</log>
        <errorlog>/data/clickhouse/clickhouse-server/logs/clickhouse-server.err.log</errorlog>
        <!-- NOTE(review): presumably each log file is rotated at 1000M and 10 rotated files are kept; verify against the server documentation. -->
        <size>1000M</size>
        <count>10</count>
        <!-- <console>1</console> -->
        <!-- Default behavior is autodetection (log to console if not daemon mode and is tty) -->
    </logger>
    <!--display_name>production</display_name-->
    <!-- It is the name that will be shown in the client -->
    <http_port>8123</http_port>
    <tcp_port>9000</tcp_port>
    <mysql_port>9004</mysql_port>
    <!-- For HTTPS and SSL over native protocol. -->
    <!--
    <https_port>8443</https_port>
    <tcp_port_secure>9440</tcp_port_secure>
    -->
    <!-- Port for communication between replicas. Used for data exchange. -->
    <interserver_http_port>9009</interserver_http_port>
    <!-- Hostname that is used by other replicas to request this server.
         If not specified, then it is determined analogous to 'hostname -f' command.
         This setting could be used to switch replication to another network interface.
      -->
    <!--
    <interserver_http_host>example.yandex.ru</interserver_http_host>
    -->
    <!-- Listen specified host. use :: (wildcard IPv6 address), if you want to accept connections both with IPv4 and IPv6 from everywhere. -->
    <!-- <listen_host>::</listen_host> -->
    <!-- Same for hosts with disabled ipv6: -->
    <!-- <listen_host>0.0.0.0</listen_host> -->
    <!-- Default values - try listen localhost on ipv4 and ipv6: -->
    <!--
    <listen_host>::1</listen_host>
    -->
    <!-- Don't exit if ipv6 or ipv4 unavailable, but listen_host with this protocol specified -->
    <!-- <listen_try>0</listen_try> -->
    <!-- Allow listen on same address:port -->
    <!-- <listen_reuse_port>0</listen_reuse_port> -->
    <!-- <listen_backlog>64</listen_backlog> -->
    <max_connections>4096</max_connections>
    <keep_alive_timeout>120</keep_alive_timeout>
    <!-- Maximum number of concurrent queries. -->
    <max_concurrent_queries>100</max_concurrent_queries>
    <!-- Set limit on number of open files (default: maximum). This setting makes sense on Mac OS X because getrlimit() fails to retrieve
         correct maximum value. -->
    <!-- <max_open_files>262144</max_open_files> -->
    <!-- Size of cache of uncompressed blocks of data, used in tables of MergeTree family.
         In bytes. Cache is single for server. Memory is allocated only on demand.
         Cache is used when 'use_uncompressed_cache' user setting turned on (off by default).
         Uncompressed cache is advantageous only for very short queries and in rare cases.
      -->
    <uncompressed_cache_size>8589934592</uncompressed_cache_size>
    <!-- Approximate size of mark cache, used in tables of MergeTree family.
         In bytes. Cache is single for server. Memory is allocated only on demand.
         You should not lower this value.
      -->
    <mark_cache_size>5368709120</mark_cache_size>
    <!-- Path to data directory, with trailing slash. -->
    <path>/data/clickhouse/clickhouse-server/</path>
    <!-- Path to temporary data for processing hard queries. -->
    <tmp_path>/data/clickhouse/clickhouse-server/tmp/</tmp_path>
    <!-- Policy from the <storage_configuration> for the temporary files.
         If not set <tmp_path> is used, otherwise <tmp_path> is ignored.

         Notes:
         - move_factor              is ignored
         - keep_free_space_bytes    is ignored
         - max_data_part_size_bytes is ignored
         - you must have exactly one volume in that policy
    -->
    <!-- <tmp_policy>tmp</tmp_policy> -->
    <!-- Storage configuration: only one setting of the disk named "default" is overridden here; no volumes or policies are declared. -->
    <storage_configuration>
        <disks>
            <default>
                <!-- 10737418240 bytes = 10 GiB. NOTE(review): presumably the amount of space to keep free on this disk; confirm against the storage documentation. -->
                <keep_free_space_bytes>10737418240</keep_free_space_bytes>
            </default>
        </disks>
    </storage_configuration>
    <!-- Directory with user provided files that are accessible by 'file' table function. -->
    <user_files_path>/data/clickhouse/clickhouse-server/user_files/</user_files_path>
    <!-- Path to configuration file with users, access rights, profiles of settings, quotas. -->
    <users_config>users.xml</users_config>
    <!-- Default profile of settings. -->
    <default_profile>default</default_profile>
    <!-- System profile of settings. These settings are used by internal processes (Buffer storage, Distributed DDL worker and so on). -->
    <!-- <system_profile>default</system_profile> -->
    <!-- Default database. -->
    <default_database>default</default_database>
    <!-- Server time zone could be set here.

         Time zone is used when converting between String and DateTime types,
          when printing DateTime in text formats and parsing DateTime from text,
          it is used in date and time related functions, if specific time zone was not passed as an argument.

         Time zone is specified as identifier from IANA time zone database, like UTC or Africa/Abidjan.
         If not specified, system time zone at server startup is used.

         Please note, that server could display time zone alias instead of specified name.
         Example: W-SU is an alias for Europe/Moscow and Zulu is an alias for UTC.
    -->
    <timezone>Asia/Shanghai</timezone>
    <!-- You can specify umask here (see "man umask"). Server will apply it on startup.
         Number is always parsed as octal. Default umask is 027 (other users cannot read logs, data files, etc; group can only read).
    -->
    <!-- <umask>022</umask> -->
    <!-- Perform mlockall after startup to lower first queries latency
          and to prevent clickhouse executable from being paged out under high IO load.
         Enabling this option is recommended but will lead to increased startup time for up to a few seconds.
    -->
    <mlock_executable>true</mlock_executable>
    <!-- Configuration of clusters that could be used in Distributed tables.
         https://clickhouse.tech/docs/en/operations/table_engines/distributed/
      -->
    <remote_servers incl="clickhouse_remote_servers"/>
    <zookeeper incl="zookeeper-servers" optional="true"/>
    <!-- Substitutions for parameters of replicated tables.
          Optional. If you don't use replicated tables, you could omit that.

         See https://clickhouse.yandex/docs/en/table_engines/replication/#creating-replicated-tables
      -->
    <macros incl="macros" optional="true"/>
    <!-- Reloading interval for embedded dictionaries, in seconds. Default: 3600. -->
    <builtin_dictionaries_reload_interval>3600</builtin_dictionaries_reload_interval>
    <!-- Maximum session timeout, in seconds. Default: 3600. -->
    <max_session_timeout>3600</max_session_timeout>
    <!-- Default session timeout, in seconds. Default: 60. -->
    <default_session_timeout>60</default_session_timeout>
    <!-- Serve endpoint for Prometheus monitoring. -->
    <!--
        port - port to setup server. If not defined or 0 then http_port is used
        metrics - send data from table system.metrics
        events - send data from table system.events
        asynchronous_metrics - send data from table system.asynchronous_metrics
    -->
    <prometheus>
        <endpoint>/metrics</endpoint>
        <port>9363</port>
        <metrics>true</metrics>
        <events>true</events>
        <asynchronous_metrics>true</asynchronous_metrics>
    </prometheus>
    <!-- Query log. Used only for queries with setting log_queries = 1. -->
    <query_log>
        <!-- What table to insert data. If table is not exist, it will be created.
             When query log structure is changed after system update,
              then old table will be renamed and new table will be created automatically.
        -->
        <database>system</database>
        <table>query_log</table>
        <!--
            PARTITION BY expr https://clickhouse.yandex/docs/en/table_engines/custom_partitioning_key/
            Example:
                event_date
                toMonday(event_date)
                toYYYYMM(event_date)
                toStartOfHour(event_time)
        -->
        <partition_by>toYYYYMM(event_date)</partition_by>
        <!-- Instead of partition_by, you can provide full engine expression (starting with ENGINE = ) with parameters,
             Example: <engine>ENGINE = MergeTree PARTITION BY toYYYYMM(event_date) ORDER BY (event_date, event_time) SETTINGS index_granularity = 1024</engine>
          -->
        <!-- Interval of flushing data. -->
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
        <ttl>event_date + INTERVAL 30 DAY DELETE</ttl>
    </query_log>
    <!-- Trace log. Stores stack traces collected by query profilers.
         See query_profiler_real_time_period_ns and query_profiler_cpu_time_period_ns settings. -->
    <trace_log>
        <database>system</database>
        <table>trace_log</table>
        <partition_by>toYYYYMM(event_date)</partition_by>
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
        <ttl>event_date + INTERVAL 30 DAY DELETE</ttl>
    </trace_log>
    <!-- Query thread log. Has information about all threads participated in query execution.
         Used only for queries with setting log_query_threads = 1. -->
    <query_thread_log>
        <database>system</database>
        <table>query_thread_log</table>
        <partition_by>toYYYYMM(event_date)</partition_by>
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
        <ttl>event_date + INTERVAL 30 DAY DELETE</ttl>
    </query_thread_log>
    <!-- Uncomment if use part log.
         Part log contains information about all actions with parts in MergeTree tables (creation, deletion, merges, downloads).
    <part_log>
        <database>system</database>
        <table>part_log</table>
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
    </part_log>
    -->
    <compression incl="clickhouse_compression">
        <!--
        <!- - Set of variants. Checked in order. Last matching case wins. If nothing matches, lz4 will be used. - ->
        <case>

            <!- - Conditions. All must be satisfied. Some conditions may be omitted. - ->
            <min_part_size>10000000000</min_part_size>        <!- - Min part size in bytes. - ->
            <min_part_size_ratio>0.01</min_part_size_ratio>   <!- - Min size of part relative to whole table size. - ->

            <!- - What compression method to use. - ->
            <method>zstd</method>
        </case>
    -->
    </compression>
    <!-- Allow to execute distributed DDL queries (CREATE, DROP, ALTER, RENAME) on cluster.
         Works only if ZooKeeper is enabled. Comment it if such functionality isn't required. -->
    <distributed_ddl>
        <!-- Path in ZooKeeper to queue with DDL queries -->
        <path>/clickhouse/task_queue/ddl</path>
        <!-- Settings from this profile will be used to execute DDL queries -->
        <!-- <profile>default</profile> -->
    </distributed_ddl>
    <!-- Settings to fine tune MergeTree tables. See documentation in source code, in MergeTreeSettings.h -->
    <!--
    <merge_tree>
        <max_suspicious_broken_parts>5</max_suspicious_broken_parts>
    </merge_tree>
    -->

    <!-- Active MergeTree overrides; the commented-out sample directly above is superseded by this block. -->
    <merge_tree>
        <!-- NOTE(review): parts_to_delay_insert and parts_to_throw_insert are both 5000. Presumably inserts are
             rejected at the same part count where throttling would begin, so the delay stage (and
             max_delay_to_insert below) may never take effect. Confirm this is intended. -->
        <parts_to_delay_insert>5000</parts_to_delay_insert>
        <parts_to_throw_insert>5000</parts_to_throw_insert>
        <max_delay_to_insert>2</max_delay_to_insert>
        <max_suspicious_broken_parts>5</max_suspicious_broken_parts>
        <max_parts_in_total>100000</max_parts_in_total>
    </merge_tree>
    
 

    





    <!-- Protection from accidental DROP.
         If size of a MergeTree table is greater than max_table_size_to_drop (in bytes) then the table cannot be dropped with any DROP query.
         If you want to delete one table and don't want to change clickhouse-server config, you could create special file <clickhouse-path>/flags/force_drop_table and make DROP once.
         By default max_table_size_to_drop is 50GB; max_table_size_to_drop=0 allows to DROP any tables.
         The same for max_partition_size_to_drop.
         Uncomment to disable protection.
    -->
    <!-- <max_table_size_to_drop>0</max_table_size_to_drop> -->
    <!-- <max_partition_size_to_drop>0</max_partition_size_to_drop> -->
    <!-- Please do not remove this line. -->
    <listen_host>0.0.0.0</listen_host>
    <!-- <zookeeper incl="zookeeper-servers"> and <macros incl="macros"> are already declared earlier in this
         file with identical attributes, so the redundant second declarations are not repeated here. -->
    <!-- File that provides the sections referenced by the incl="..." attributes in this configuration. -->
    <include_from>/etc/clickhouse-server/metrika.xml</include_from>
    <!-- 0 lifts the table-size protection for DROP, as described in the max_table_size_to_drop comment above. -->
    <max_table_size_to_drop>0</max_table_size_to_drop>
</yandex>

metrika.xml 配置

xml 复制代码
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Substitutions file. config.xml points at it through <include_from> and pulls the
  sections below in by name through incl="..." attributes.
  NOTE(review): config.xml also references incl="macros" (marked optional) and
  incl="clickhouse_compression"; neither section is defined in this file. Confirm that is intended.
-->
<yandex>
    <!-- Substituted into <remote_servers incl="clickhouse_remote_servers"/> in config.xml. -->
    <clickhouse_remote_servers>
        <!-- Cluster named "default_cluster": a single shard holding two replicas. -->
        <default_cluster>
            <shard>
                <internal_replication>true</internal_replication>
                <!-- NOTE(review): the hosts and the "xxx" user/password values look like redacted placeholders; replace with real values. -->
                <replica>
                    <host>127.0.1.2</host>
                    <port>9000</port>
                    <user>xxx</user>
                    <password>xxx</password>
                </replica>
                <replica>
                    <host>127.0.0.3</host>
                    <port>9000</port>
                    <user>xxx</user>
                    <password>xxx</password>
                </replica>
            </shard>
        </default_cluster>
    </clickhouse_remote_servers>
    <!-- Substituted into <zookeeper incl="zookeeper-servers" optional="true"/> in config.xml: three nodes, all on port 2181. -->
    <zookeeper-servers>
        <node>
            <host>127.1.1.15</host>
            <port>2181</port>
        </node>
        <node>
            <host>127.1.1.16</host>
            <port>2181</port>
        </node>
        <node>
            <host>127.1.1.17</host>
            <port>2181</port>
        </node>
    </zookeeper-servers>
</yandex>

users.xml 配置

xml 复制代码
<yandex>
    <!-- Profiles of settings. -->
    <profiles>
        <!-- Default settings. -->
        <default>
            <!-- Maximum memory usage for processing single query, in bytes. -->
            <max_memory_usage>250000000000</max_memory_usage>
            <!--
             <max_memory_usage_for_all_queries>100000000000</max_memory_usage_for_all_queries>
             -->
            <!-- Use cache of uncompressed blocks of data. Meaningful only for processing many of very short queries. -->
            <use_uncompressed_cache>0</use_uncompressed_cache>
            <!-- How to choose between replicas during distributed query processing.
                  random - choose random replica from set of replicas with minimum number of errors
                  nearest_hostname - from set of replicas with minimum number of errors, choose replica
                   with minimum number of different symbols between replica's hostname and local hostname
                   (Hamming distance).
                  in_order - first live replica is chosen in specified order.
                  first_or_random - if first replica one has higher number of errors, pick a random one from replicas with minimum number of errors.
             -->
            <load_balancing>random</load_balancing>
            <max_partitions_per_insert_block>0</max_partitions_per_insert_block>
            <background_pool_size>32</background_pool_size>
            <max_compress_block_size>10485760</max_compress_block_size>
            <min_insert_block_size_rows>10000000</min_insert_block_size_rows>
            <min_insert_block_size_bytes>1024000000</min_insert_block_size_bytes>
        </default>
        <!-- Profile that allows only read queries. -->
        <readonly>
            <readonly>1</readonly>
        </readonly>
    </profiles>
    <!-- Users and ACL. -->
    <users>
        <!-- If user name was not specified, 'default' user is used. -->
        <!-- Definition of the user named "root": credentials, allowed networks, settings profile and quota. -->
        <root>
            <!-- Password could be specified in plaintext or in SHA256 (in hex format).

                  If you want to specify password in plaintext (not recommended), place it in 'password' element.
                  Example: <password>qwerty</password>.
                  Password could be empty.

                  If you want to specify SHA256, place it in 'password_sha256_hex' element.
                  Example: <password_sha256_hex>65e84be33532fb784c48129675f9eff3a682b27168c0ea744b2cf58ee02337c5</password_sha256_hex>
                  Restrictions of SHA256: impossibility to connect to ClickHouse using MySQL JS client (as of July 2019).

                  If you want to specify double SHA1, place it in 'password_double_sha1_hex' element.
                  Example: <password_double_sha1_hex>e395796d6546b1b65db9d665cd43f0e858dd4303</password_double_sha1_hex>

                  How to generate decent password:
                  Execute: PASSWORD=$(base64 < /dev/urandom | head -c8); echo "$PASSWORD"; echo -n "$PASSWORD" | sha256sum | tr -d '-'
                  In first line will be password and in second - corresponding SHA256.

                  How to generate double SHA1:
                  Execute: PASSWORD=$(base64 < /dev/urandom | head -c8); echo "$PASSWORD"; echo -n "$PASSWORD" | sha1sum | tr -d '-' | xxd -r -p | sha1sum | tr -d '-'
                  In first line will be password and in second - corresponding double SHA1.
             -->
            <password>xxxxx</password>
            <!-- List of networks with open access.

                  To open access from everywhere, specify:
                     <ip>::/0</ip>

                  To open access only from localhost, specify:
                     <ip>::1</ip>
                     <ip>127.0.0.1</ip>

                  Each element of list has one of the following forms:
                  <ip> IP-address or network mask. Examples: 213.180.204.3 or 10.0.0.1/8 or 10.0.0.1/255.255.255.0
                      2a02:6b8::3 or 2a02:6b8::3/64 or 2a02:6b8::3/ffff:ffff:ffff:ffff::.
                  <host> Hostname. Example: server01.yandex.ru.
                      To check access, DNS query is performed, and all received addresses compared to peer address.
                  <host_regexp> Regular expression for host names. Example, ^server\d\d-\d\d-\d\.yandex\.ru$
                      To check access, DNS PTR query is performed for peer address and then regexp is applied.
                      Then, for result of PTR query, another DNS query is performed and all received addresses compared to peer address.
                      Strongly recommended that regexp ends with $
                  All results of DNS requests are cached till server restart.
             -->
            <networks incl="networks" replace="replace">
                <ip>::/0</ip>
            </networks>
            <!-- Settings profile for user. -->
            <profile>default</profile>
            <!-- Quota for user. -->
            <quota>default</quota>
            <!-- NOTE(review): presumably lets this user manage users and roles through SQL; confirm this is intended. -->
            <access_management>1</access_management>
        <!-- The closing tag must match the opening <root> tag for the document to be well-formed. -->
        </root>
    </users>
    <!-- Quotas. -->
    <quotas>
        <!-- Name of quota. -->
        <default>
            <!-- Limits for time interval. You could specify many intervals with different limits. -->
            <interval>
                <!-- Length of interval. -->
                <duration>3600</duration>
                <!-- No limits. Just calculate resource usage for time interval. -->
                <queries>0</queries>
                <errors>0</errors>
                <result_rows>0</result_rows>
                <read_rows>0</read_rows>
                <execution_time>0</execution_time>
            </interval>
        </default>
    </quotas>
</yandex>
相关推荐
一只鹿鹿鹿4 分钟前
智慧能源大数据平台建设方案(PPT)
java·大数据·数据库·能源
深蓝易网16 分钟前
深度拆解!MES如何重构生产计划与排产调度全流程?
大数据·运维·人工智能·重构·架构·制造
intcube16 分钟前
集中运营、分散决策,寻找最佳财务规划的平衡点
大数据·信息可视化·数据分析·全面预算管理·财务管理·财务规划
时序数据说24 分钟前
IoTDB 分段查询语句深度剖析:GROUP BY 与时序语义的完美结合
大数据·数据库·开源·时序数据库·iotdb
Light601 小时前
Spark在大数据ETL中的应用:数据清洗与转换实战
大数据·spark·etl·数据清洗·数据转换
人大博士的交易之路2 小时前
今日行情明日机会——20250512
大数据·数学建模·数据挖掘·缠论·缠中说禅·涨停回马枪
庄小焱2 小时前
数据治理域——数据治理体系建设
大数据·数据治理·系统设计·数仓系统设计
芯盾时代2 小时前
数据出境的安全合规思考
大数据·人工智能·安全·网络安全·信息与通信
不学会Ⅳ3 小时前
【吃透 Elasticsearch 的核心原理】学习步骤
大数据·学习·elasticsearch
李昊哲小课5 小时前
tensorflow-cpu
大数据·人工智能·python·深度学习·数据分析·tensorflow