How to Batch Import Data into Hive with the Java API

  • 2021-11-02 01:02:47
  • OfStack

Implementing batch import of data into Hive with the Java API

Data generated in a Java program can be batch-inserted into an Oracle or MySQL database through a JDBC connection, but current versions of Hive do not support batch inserts: the result data must first be written to an HDFS file and then loaded into the Hive table.
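For contrast, a batch insert into Oracle or MySQL goes through PreparedStatement.addBatch()/executeBatch(). Below is a minimal sketch for MySQL; the table user_info, its two columns, and the connection settings are illustrative assumptions, and the MySQL JDBC driver is assumed to be on the classpath:

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;

public class JdbcBatchInsertDemo {
    public static void main(String[] args) throws Exception {
        // illustrative connection settings -- adjust host, database, user and password
        String url = "jdbc:mysql://localhost:3306/test";
        try (Connection con = DriverManager.getConnection(url, "root", "root");
             PreparedStatement ps = con.prepareStatement(
                     "insert into user_info (id, gender) values (?, ?)")) { // hypothetical table
            con.setAutoCommit(false); // commit once for the whole batch
            String[][] rows = {{"12345", "m"}, {"54321", "f"}};
            for (String[] row : rows) {
                ps.setString(1, row[0]);
                ps.setString(2, row[1]);
                ps.addBatch(); // queue the row instead of executing it immediately
            }
            ps.executeBatch(); // send all queued rows at once
            con.commit();
        }
    }
}

The Hive example below takes the other route: it writes the rows to an HDFS file and then issues a LOAD DATA statement over JDBC.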


package com.enn.idcard; 
import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;
 
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
/**
 * <p>Description: </p>
 * @author kangkaia
 * @date 2017-12-26 13:42:24
 */
public class HiveJdbc { 
    public static void main(String[] args) throws IOException {
    	List<List<String>> argList = new ArrayList<List<String>>();
		List<String> arg = new ArrayList<String>();
		arg.add("12345");
		arg.add("m");
		argList.add(arg);
		arg = new ArrayList<String>();
		arg.add("54321");
		arg.add("f");
		argList.add(arg);
//		System.out.println(argList.toString());
		String dst = "/test/kk.txt";
		createFile(dst,argList);
		loadData2Hive(dst);
    }
 
    /**
     * Writes the rows to an HDFS file that will later be loaded into a Hive table.
     * The field separator is "\001", Hive's default.
     * @param dst destination path in HDFS
     * @param argList rows to write, each row a list of field values
     * @throws IOException
     */
    public static void createFile(String dst , List<List<String>> argList) throws IOException{
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path dstPath = new Path(dst); // destination path in HDFS
        // open an output stream to the destination file
        FSDataOutputStream outputStream = fs.create(dstPath);
        StringBuilder sb = new StringBuilder();
        for(List<String> arg:argList){
            for(String value:arg){
                sb.append(value).append("\001");
            }
            sb.deleteCharAt(sb.length() - 1);// remove the trailing separator ("\001" is a single character)
            sb.append("\n");
        }
        sb.deleteCharAt(sb.length() - 1);// remove the trailing newline
        byte[] contents =  sb.toString().getBytes();
        outputStream.write(contents);
        outputStream.close();
        fs.close();
        System.out.println(" File created successfully! ");        
    }
    /**
     * Loads the HDFS file into the Hive table.
     * @param dst path of the HDFS file to load
     */
    public static void loadData2Hive(String dst) {
    	String JDBC_DRIVER = "org.apache.hive.jdbc.HiveDriver";
    	String CONNECTION_URL = "jdbc:hive2://server-13:10000/default;auth=noSasl";
    	String username = "admin";
        String password = "admin";
        Connection con = null;
		
		try {
			Class.forName(JDBC_DRIVER);
			con = DriverManager.getConnection(CONNECTION_URL,username,password);
			Statement stmt = con.createStatement();			
			String sql = " load data inpath '"+dst+"' into table population.population_information ";
			
			stmt.execute(sql);
			System.out.println("loadData To Hive Table success! ");
		} catch (SQLException e) {
			e.printStackTrace();
		} catch (ClassNotFoundException e) {
			e.printStackTrace();
		} finally {
			// close the connection
			if(con != null){
				try {
					con.close();
				} catch (SQLException e) {
					e.printStackTrace();
				}
			}
		}
	}    
}
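This is not part of the original example, but the load can be verified over the same Hive JDBC connection; a minimal sketch, assuming the table exposes (at least) the two columns written by createFile():

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

public class HiveLoadCheck {
    public static void main(String[] args) throws Exception {
        Class.forName("org.apache.hive.jdbc.HiveDriver");
        // same connection settings as in loadData2Hive() above
        try (Connection con = DriverManager.getConnection(
                "jdbc:hive2://server-13:10000/default;auth=noSasl", "admin", "admin");
             Statement stmt = con.createStatement();
             ResultSet rs = stmt.executeQuery(
                 "select * from population.population_information limit 10")) {
            while (rs.next()) {
                // columns 1 and 2 correspond to the fields written by createFile()
                System.out.println(rs.getString(1) + "\t" + rs.getString(2));
            }
        }
    }
}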

Note:

This example is built using mvn, and the conf configuration file is placed in the src/main/resources directory.

Hive's built-in file storage formats include textfile, sequencefile, rcfile, and so on. Users can also define custom input formats by implementing the corresponding interfaces.

In practice, textfile is rarely used because it is uncompressed and incurs high disk and parsing overhead. Sequencefile is a binary format that stores data as key-value pairs and supports record-level and block-level compression. Rcfile is a hybrid row-column storage format (textfile and sequencefile are both row-oriented): it guarantees that all fields of a record land in the same HDFS block, while within each block the data is stored by column. Generally speaking, row-oriented storage suits OLTP workloads better, while column-oriented storage suits OLAP workloads better; in particular, aggregate operations over a single column are far cheaper in a column-oriented layout than in a row-oriented one. Although rcfile's per-column operations still carry some overhead, its high compression ratio significantly reduces file size, so in practice rcfile is often the preferred choice, and many data platforms have adopted it widely when selecting a file storage format.

Tables populated through HDFS default to the textfile format. To change the storage format, first create an empty table in sequencefile, rcfile, or another format, and then re-insert the data:


insert overwrite table seqfile_table select * from textfile_table; 
 ... 
insert overwrite table rcfile_table select * from textfile_table;
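The empty target table must exist before the insert overwrite runs. A minimal sketch of the preparatory DDL, executed through the same JDBC pattern as above; the two-column schema is a placeholder assumption and must match textfile_table:

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;

public class ConvertStorageFormat {
    public static void main(String[] args) throws Exception {
        Class.forName("org.apache.hive.jdbc.HiveDriver");
        try (Connection con = DriverManager.getConnection(
                "jdbc:hive2://server-13:10000/default;auth=noSasl", "admin", "admin");
             Statement stmt = con.createStatement()) {
            // create the empty target table in the new format first;
            // the column list is a placeholder and must match textfile_table
            stmt.execute("create table rcfile_table (id string, gender string) stored as rcfile");
            // then re-insert the data from the textfile table
            stmt.execute("insert overwrite table rcfile_table select * from textfile_table");
        }
    }
}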

Java Batch Insert into Hive via HDFS

With a slight modification, the code above saves the data and loads it into Hive.

Simulated data is written into HDFS and then loaded into Hive. Remember to add the Hive JDBC dependency (shown below), or an error will be reported.

Before loading data, the target table should preferably be an external table; otherwise, dropping the table deletes the data along with the metadata!


  <dependency>
   <groupId>org.apache.hive</groupId>
   <artifactId>hive-jdbc</artifactId>
   <version>1.1.0</version>
  </dependency>
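The load in the next example targets a table named test, which must already exist. A minimal sketch of a matching external table, assuming a two-column schema for the sample rows, the '|' separator used in the code, and a hypothetical HDFS location:

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;

public class CreateExternalTable {
    public static void main(String[] args) throws Exception {
        Class.forName("org.apache.hive.jdbc.HiveDriver");
        try (Connection con = DriverManager.getConnection(
                "jdbc:hive2://hadoop:10000/default", "root", "root");
             Statement stmt = con.createStatement()) {
            // an external table keeps the underlying data when the table is dropped;
            // the two-column schema and the location path are illustrative assumptions
            stmt.execute("create external table if not exists test (id string, gender string) "
                    + "row format delimited fields terminated by '|' "
                    + "location '/test/external/test'");
        }
    }
}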

Code


import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
public class Demo {
	    public static void main(String[] args) throws Exception {
	    	List<List<String>> argList = new ArrayList<List<String>>();
			List<String> arg = new ArrayList<String>();
			arg.add("12345");
			arg.add("m");
			argList.add(arg);
			arg = new ArrayList<String>();
			arg.add("54321");
			arg.add("f");
			argList.add(arg);
//			System.out.println(argList.toString());
			String dst = "/test/kk.txt";
			createFile(dst,argList);
//			loadData2Hive(dst);
	    }
	    /**
	     * Writes the rows to an HDFS file that will later be loaded into a Hive table.
	     * The field separator used here is "|".
	     * @param dst destination path in HDFS
	     * @param argList rows to write, each row a list of field values
	     * @throws Exception
	     */
	    public static void createFile(String dst , List<List<String>> argList) throws Exception{
	        Configuration conf = new Configuration();
			FileSystem fs = FileSystem.get(new URI("hdfs://hadoop:9000"),conf,"root");
	        Path dstPath = new Path(dst); // destination path in HDFS
	        // open an output stream to the destination file
	        FSDataOutputStream outputStream = fs.create(dstPath);
	        StringBuilder sb = new StringBuilder();
	        for(List<String> arg:argList){
				for(String value:arg){
					sb.append(value).append("|");
				}
				sb.deleteCharAt(sb.length() - 1);// remove the trailing separator
				sb.append("\n");
			}
	        byte[] contents =  sb.toString().getBytes();
	        outputStream.write(contents);
			outputStream.flush();
	        outputStream.close();
	        fs.close();
	        System.out.println(" File created successfully! ");
	        
	    }
	    /**
	     * Loads the HDFS file into the Hive table.
	     * @param dst path of the HDFS file to load
	     */
	    public static void loadData2Hive(String dst) {
	    	String JDBC_DRIVER = "org.apache.hive.jdbc.HiveDriver";
	    	String CONNECTION_URL = "jdbc:hive2://hadoop:10000/default";
	    	String username = "root";
	        String password = "root";
	        Connection con = null;
			
			try {
				Class.forName(JDBC_DRIVER);
				con = DriverManager.getConnection(CONNECTION_URL,username,password);
				Statement stmt = con.createStatement();
				
				String sql = " load data inpath '"+dst+"' into table test ";//test  For the inserted table 
				
				stmt.execute(sql);
				System.out.println("loadData To Hive Table success! ");
			} catch (SQLException e) {
				e.printStackTrace();
			} catch (ClassNotFoundException e) {
				e.printStackTrace();
			} finally {
				// close the connection
				if(con != null){
					try {
						con.close();
					} catch (SQLException e) {
						e.printStackTrace();
					}
				}
			}
		}
	    
	}
