Apache Hadoop (CDH 5) APIs
This tutorial shows how to use the Cloudera Manager REST API to explore Cloudera's Distribution Including Apache Hadoop (CDH 5) and how to run it on an EC2 cluster. We're using one NameNode and three DataNodes.
We have four EC2 instances: one for the NameNode and three for the DataNodes, all managed by Cloudera Manager 5.
For performance reasons, we use r3-type instances. The installed packages require a fair amount of space, so we use a root volume larger than 30 GB and an additional data volume larger than 60 GB.
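For reference, launching comparable instances from the AWS CLI might look like the sketch below. This is only an illustration: the AMI ID, key pair name, security group, and device names are placeholders, and the volume sizes are simply chosen to satisfy the root (>30 GB) and data (>60 GB) requirements above.

$ aws ec2 run-instances \
    --image-id ami-xxxxxxxx \
    --count 4 \
    --instance-type r3.large \
    --key-name my-key \
    --security-group-ids sg-xxxxxxxx \
    --block-device-mappings '[
      {"DeviceName": "/dev/sda1", "Ebs": {"VolumeSize": 40, "VolumeType": "gp2"}},
      {"DeviceName": "/dev/sdb",  "Ebs": {"VolumeSize": 80, "VolumeType": "gp2"}}
    ]'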
Once Cloudera Manager is up, we can talk to it through its REST API on port 7180. To list the clusters it manages:

$ curl -u admin:admin 'http://ip-address:7180/api/v1/clusters'
{
  "items" : [ {
    "name" : "Cluster 1",
    "version" : "CDH5"
  } ]
}
Depending on the API version specified in the URL, the output differs slightly:
$ curl -u admin:admin 'http://ip-address:7180/api/v1/clusters'
{
  "items" : [ {
    "name" : "Cluster 1",
    "version" : "CDH5"
  } ]
}

$ curl -u admin:admin 'http://ip-address:7180/api/v2/clusters'
{
  "items" : [ {
    "name" : "Cluster 1",
    "version" : "CDH5",
    "maintenanceMode" : false,
    "maintenanceOwners" : [ ]
  } ]
}

$ curl -u admin:admin 'http://ip-address:7180/api/v6/clusters'
{
  "items" : [ {
    "name" : "cluster",
    "displayName" : "Cluster 1",
    "version" : "CDH5",
    "fullVersion" : "5.0.0",
    "maintenanceMode" : false,
    "maintenanceOwners" : [ ]
  } ]
}
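If you are unsure which API versions the server accepts, Cloudera Manager also exposes the highest supported version at the /api/version endpoint. The value shown below is only an example; it depends on the Cloudera Manager release you are running.

$ curl -u admin:admin 'http://ip-address:7180/api/version'
v6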
To list the services running on the cluster:

$ curl -u admin:admin \
  'http://ip-address:7180/api/v9/clusters/cluster/services'
{
  "items" : [
    { "name" : "hue", "type" : "HUE", "clusterRef" : { "clusterName" : "cluster" }, "serviceUrl" : "http://ec2-instance-name:7180/cmf/serviceRedirect/hue", "serviceState" : "STARTED", "healthSummary" : "NOT_AVAILABLE", "healthChecks" : [ ], "configStalenessStatus" : "FRESH", "clientConfigStalenessStatus" : "FRESH", "maintenanceMode" : false, "maintenanceOwners" : [ ], "displayName" : "Hue" },
    { "name" : "oozie", "type" : "OOZIE", "clusterRef" : { "clusterName" : "cluster" }, "serviceUrl" : "http://ec2-instance-name:7180/cmf/serviceRedirect/oozie", "serviceState" : "STARTED", "healthSummary" : "NOT_AVAILABLE", "healthChecks" : [ ], "configStalenessStatus" : "FRESH", "clientConfigStalenessStatus" : "FRESH", "maintenanceMode" : false, "maintenanceOwners" : [ ], "displayName" : "Oozie" },
    { "name" : "zookeeper", "type" : "ZOOKEEPER", "clusterRef" : { "clusterName" : "cluster" }, "serviceUrl" : "http://ec2-instance-name:7180/cmf/serviceRedirect/zookeeper", "serviceState" : "STARTED", "healthSummary" : "NOT_AVAILABLE", "healthChecks" : [ ], "configStalenessStatus" : "FRESH", "clientConfigStalenessStatus" : "FRESH", "maintenanceMode" : false, "maintenanceOwners" : [ ], "displayName" : "ZooKeeper" },
    { "name" : "hdfs", "type" : "HDFS", "clusterRef" : { "clusterName" : "cluster" }, "serviceUrl" : "http://ec2-instance-name:7180/cmf/serviceRedirect/hdfs", "serviceState" : "STARTED", "healthSummary" : "NOT_AVAILABLE", "healthChecks" : [ ], "configStalenessStatus" : "FRESH", "clientConfigStalenessStatus" : "FRESH", "maintenanceMode" : false, "maintenanceOwners" : [ ], "displayName" : "HDFS" },
    { "name" : "spark", "type" : "SPARK", "clusterRef" : { "clusterName" : "cluster" }, "serviceUrl" : "http://ec2-instance-name:7180/cmf/serviceRedirect/spark", "serviceState" : "STARTED", "healthSummary" : "NOT_AVAILABLE", "healthChecks" : [ ], "configStalenessStatus" : "FRESH", "clientConfigStalenessStatus" : "FRESH", "maintenanceMode" : false, "maintenanceOwners" : [ ], "displayName" : "Spark (Standalone)" },
    { "name" : "hbase", "type" : "HBASE", "clusterRef" : { "clusterName" : "cluster" }, "serviceUrl" : "http://ec2-instance-name:7180/cmf/serviceRedirect/hbase", "serviceState" : "STARTED", "healthSummary" : "NOT_AVAILABLE", "healthChecks" : [ ], "configStalenessStatus" : "FRESH", "clientConfigStalenessStatus" : "FRESH", "maintenanceMode" : false, "maintenanceOwners" : [ ], "displayName" : "HBase" },
    { "name" : "mapreduce", "type" : "MAPREDUCE", "clusterRef" : { "clusterName" : "cluster" }, "serviceUrl" : "http://ec2-instance-name:7180/cmf/serviceRedirect/mapreduce", "serviceState" : "STARTED", "healthSummary" : "NOT_AVAILABLE", "healthChecks" : [ ], "configStalenessStatus" : "FRESH", "clientConfigStalenessStatus" : "FRESH", "maintenanceMode" : false, "maintenanceOwners" : [ ], "displayName" : "MapReduce" },
    { "name" : "yarn", "type" : "YARN", "clusterRef" : { "clusterName" : "cluster" }, "serviceUrl" : "http://ec2-instance-name:7180/cmf/serviceRedirect/yarn", "serviceState" : "STARTED", "healthSummary" : "NOT_AVAILABLE", "healthChecks" : [ ], "configStalenessStatus" : "FRESH", "clientConfigStalenessStatus" : "FRESH", "maintenanceMode" : false, "maintenanceOwners" : [ ], "displayName" : "YARN (MR2 Included)" },
    { "name" : "hive", "type" : "HIVE", "clusterRef" : { "clusterName" : "cluster" }, "serviceUrl" : "http://ec2-instance-name:7180/cmf/serviceRedirect/hive", "serviceState" : "STARTED", "healthSummary" : "NOT_AVAILABLE", "healthChecks" : [ ], "configStalenessStatus" : "FRESH", "clientConfigStalenessStatus" : "FRESH", "maintenanceMode" : false, "maintenanceOwners" : [ ], "displayName" : "Hive" }
  ]
}
To look at a single service, such as HDFS:

$ curl -u admin:admin \
  'http://ip-address:7180/api/v9/clusters/cluster/services/hdfs'
{
  "name" : "hdfs",
  "type" : "HDFS",
  "clusterRef" : { "clusterName" : "cluster" },
  "serviceUrl" : "http://ec2-instance-name:7180/cmf/serviceRedirect/hdfs",
  "serviceState" : "STARTED",
  "healthSummary" : "NOT_AVAILABLE",
  "healthChecks" : [ ],
  "configStalenessStatus" : "FRESH",
  "clientConfigStalenessStatus" : "FRESH",
  "maintenanceMode" : false,
  "maintenanceOwners" : [ ],
  "displayName" : "HDFS"
}
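The same pattern drills down to role instances. For example, appending 'roles' should list the roles of the HDFS service (one NameNode and three DataNodes in this deployment); the exact output depends on your cluster, so it is not reproduced here:

$ curl -u admin:admin \
  'http://ip-address:7180/api/v9/clusters/cluster/services/hdfs/roles'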
To get the configuration for HDFS, we append 'config' to the service endpoint used above:
$ curl -u admin:admin 'http://ip-address:7180/api/v9/clusters/cluster/services/hdfs/config'
{
  "items" : [ {
    "name" : "zookeeper_service",
    "value" : "zookeeper"
  } ]
}
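Configuration can be changed through the same endpoint. As a sketch only: dfs_client_use_datanode_hostname is one of the parameters shown in the full view below, and setting it to true here is purely illustrative. A PUT with a JSON body updates the value, and a restart command applies it:

$ curl -X PUT -u admin:admin \
  -H 'Content-Type: application/json' \
  -d '{ "items" : [ { "name" : "dfs_client_use_datanode_hostname", "value" : "true" } ] }' \
  'http://ip-address:7180/api/v9/clusters/cluster/services/hdfs/config'

$ curl -X POST -u admin:admin \
  'http://ip-address:7180/api/v9/clusters/cluster/services/hdfs/commands/restart'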
By default, only parameters that have been set away from their defaults are returned, which is why the list above is so short. The full configuration view (view=full) shows all parameters, each with a description:
$ curl -u admin:admin \
  'http://ip-address:7180/api/v9/clusters/cluster/services/hdfs/config?view=full'
{
  "items" : [ {
    "name" : "zookeeper_service",
    "value" : "zookeeper",
    "required" : false,
    "displayName" : "ZooKeeper Service",
    "description" : "Name of the ZooKeeper service that this HDFS service instance depends on",
    "relatedName" : "",
    "validationState" : "OK"
  }, {
    "name" : "httpfs_proxy_user_groups_list",
    "required" : false,
    "default" : "*",
    "displayName" : "HttpFS Proxy User Groups",
    "description" : "Comma-delimited list of groups to allow the HttpFS user to impersonate. The default '*' allows all groups. To disable entirely, use a string that doesn't correspond to a group name, such as '_no_group_'.",
    "relatedName" : "hadoop.proxyuser.httpfs.groups",
    "validationState" : "OK"
  }, {
    "name" : "hdfs_namenode_health_enabled",
    "required" : false,
    "default" : "true",
    "displayName" : "Active NameNode Role Health Check",
    "description" : "When computing the overall HDFS cluster health, consider the active NameNode's health",
    "relatedName" : "",
    "validationState" : "OK"
  }, {
    "name" : "dfs_client_use_datanode_hostname",
    "required" : false,
    "default" : "false",
    "displayName" : "Use DataNode Hostname",
    "description" : "Typically, HDFS clients and servers communicate by opening sockets via an IP address. In certain networking configurations, it is preferable to open sockets after doing a DNS lookup on the hostname. Enable this property to open sockets after doing a DNS lookup on the hostname. This property is supported in CDH3u4 or later deployments.",
    "relatedName" : "dfs.client.use.datanode.hostname",
    "validationState" : "OK"
  }, {
    "name" : "hdfs_active_namenode_detecton_window",
    "required" : false,
    "default" : "3",
    "displayName" : "Active NameNode Detection Window",
    "description" : "The tolerance window that will be used in HDFS service tests that depend on detection of the active NameNode.",
    "relatedName" : "",
    "validationState" : "OK"
  }, {
  ...
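Finally, the entire Cloudera Manager deployment (hosts, clusters, services, roles, and their configurations) can be dumped in a single call, which is handy as a backup before experimenting with configuration changes. The output file name below is just an example:

$ curl -u admin:admin \
  'http://ip-address:7180/api/v9/cm/deployment' > cm-deployment.json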