A beginner's guide to deduplicating the contents of a large text file in Java
- 2021-09-24 22:39:40
- OfStack
Contents: a naive approach with memory-overflow risk; split writing by hashCode; summary
The naive approach, which risks memory overflow:
/**
 * Naive deduplication: loads every distinct line of the input file into a
 * single in-memory {@link HashSet}. If the file contains more distinct lines
 * than the heap can hold, this throws {@link OutOfMemoryError} (see the stack
 * trace below) — that is the flaw the hash-bucket version fixes.
 */
public static void distinct() {
    distinct("G://password/all.txt", "G://password/all-distinced.txt");
}

/**
 * Deduplicates the lines of {@code sourcePath} and writes the unique lines
 * to {@code targetPath}. Lines are trimmed; blank lines are dropped.
 * Output order is unspecified (HashSet iteration order).
 *
 * @param sourcePath path of the file to deduplicate
 * @param targetPath path of the deduplicated output file
 */
public static void distinct(String sourcePath, String targetPath) {
    Set<String> uniqueLines = new HashSet<String>();
    // try-with-resources guarantees the reader is closed on every path.
    try (BufferedReader br = new BufferedReader(new FileReader(new File(sourcePath)))) {
        String line;
        while ((line = br.readLine()) != null) {
            line = line.trim();
            // Bug fix: the original used `line != ""`, a reference comparison
            // that is always true, so blank lines were never filtered out.
            if (!line.isEmpty()) {
                uniqueLines.add(line);
            }
        }
    } catch (IOException e) {
        // FileNotFoundException is a subclass of IOException, so one catch suffices.
        e.printStackTrace();
        return; // don't write a partial result if reading failed
    }
    // Separate try block: the original created the PrintWriter first and used it
    // after the try/finally, risking an NPE when the constructor had thrown.
    try (PrintWriter pw = new PrintWriter(new File(targetPath))) {
        for (String s : uniqueLines) {
            pw.println(s);
        }
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    }
}
The JVM runs out of memory:
Exception in thread "main" java.lang.OutOfMemoryError: GC overhead limit exceeded
at java.util.HashMap.newNode(HashMap.java:1734)
at java.util.HashMap.putVal(HashMap.java:630)
at java.util.HashMap.put(HashMap.java:611)
at java.util.HashSet.add(HashSet.java:219)
at encode.Main.distinct(Main.java:180)
at encode.Main.main(Main.java:215)
The improved approach: split the file into bucket files by hashCode modulo, then deduplicate one bucket at a time:
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.HashSet;
import java.util.Set;
public class DistinctFileUtil {

    /**
     * Partitions the target file into {@code splitSize} bucket files keyed by
     * line hash. Because identical lines always hash to the same value, all
     * duplicates of a line land in the same bucket, so each bucket can later
     * be deduplicated independently with bounded memory.
     *
     * @param targetFile path of the file to be deduplicated
     * @param splitSize  number of bucket files to split into
     * @return the bucket files, created in a "test" folder next to the source file
     */
    public static File[] splitFile(String targetFile, int splitSize) {
        File file = new File(targetFile);
        File[] littleFiles = new File[splitSize];
        File tempFolder = new File(file.getParent() + File.separator + "test");
        if (!tempFolder.exists()) {
            tempFolder.mkdir();
        }
        PrintWriter[] pws = new PrintWriter[splitSize];
        try {
            for (int i = 0; i < splitSize; i++) {
                littleFiles[i] = new File(tempFolder.getAbsolutePath() + File.separator + i + ".txt");
                if (littleFiles[i].exists()) {
                    littleFiles[i].delete();
                }
                pws[i] = new PrintWriter(littleFiles[i]);
            }
            // try-with-resources closes the reader on every path.
            try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    line = line.trim();
                    // Bug fix: the original used `line != ""`, a reference
                    // comparison that is always true, so blank lines were
                    // written into the buckets as well.
                    if (!line.isEmpty()) {
                        // Math.abs keeps the index non-negative; the modulo result's
                        // magnitude is already < splitSize, so abs cannot overflow here.
                        int index = Math.abs(line.hashCode() % splitSize);
                        pws[index].println(line);
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Bucket writers span two try blocks, so they are closed manually.
            for (PrintWriter pw : pws) {
                if (pw != null) {
                    pw.close();
                }
            }
        }
        return littleFiles;
    }

    /**
     * Deduplicates each bucket file in memory — one bucket at a time — and
     * appends the unique lines to the output file, then deletes the buckets.
     * Peak memory is one bucket's worth of distinct lines, not the whole file's.
     *
     * @param littleFiles      bucket files produced by {@link #splitFile}
     * @param distinctFilePath path of the deduplicated output file
     * @param splitSize        number of bucket files
     */
    public static void distinct(File[] littleFiles, String distinctFilePath, int splitSize) {
        File distinctedFile = new File(distinctFilePath);
        if (distinctedFile.exists()) {
            distinctedFile.delete();
        }
        // PrintWriter creates the file itself, so the original's ignored
        // createNewFile() call was redundant and has been dropped.
        try (PrintWriter pw = new PrintWriter(distinctedFile)) {
            Set<String> uniqueLines = new HashSet<String>();
            for (int i = 0; i < splitSize; i++) {
                if (!littleFiles[i].exists()) {
                    continue;
                }
                System.out.println(" Start working on small files: " + littleFiles[i].getName() + " Weight removal ");
                try (BufferedReader br = new BufferedReader(new FileReader(littleFiles[i]))) {
                    String line;
                    while ((line = br.readLine()) != null) {
                        // Bug fix: the original used `line != ""` (reference
                        // comparison, always true).
                        if (!line.isEmpty()) {
                            uniqueLines.add(line);
                        }
                    }
                }
                for (String s : uniqueLines) {
                    pw.println(s);
                }
                // Reuse the set so only one bucket is ever held in memory.
                // The original's explicit System.gc() call was removed: it is
                // at best a hint and clearing the set already frees the entries.
                uniqueLines.clear();
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Delete the temporary bucket files even if merging failed partway.
            for (int i = 0; i < splitSize; i++) {
                if (littleFiles[i] != null && littleFiles[i].exists()) {
                    littleFiles[i].delete();
                }
            }
        }
    }

    public static void main(String[] args) throws IOException {
        int splitSize = 20;
        File[] files = splitFile("G://test/bigfile.txt", splitSize);
        distinct(files, "G://test/bigfile-distinct.txt", splitSize);
    }
}
Summary
That concludes this article. We hope you found it helpful, and we encourage you to explore the other content on this site!