package com.hsk.utils;
import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.io.FileUtil;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.hsk.entity.CoCo.*;
import com.hsk.entity.voc.VocImage;
import com.hsk.entity.voc.VocItem;
import io.micrometer.common.util.StringUtils;
import lombok.extern.slf4j.Slf4j;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
/**
* @author: huangshikai
* @Date: 2024/5/22 10:20
* @Description: Dataset utility class
*/
@Slf4j
public class DataSetUtil {
// Image path
private static final String IMAGE_PATH = "images";
// Annotation info path
private static final String ANNOTATION_PATH = "annotations";
/**
 * Converts an OCR label file into a COCO-style dataset below {@code targetPath}.
 *
 * <p>The label file is parsed with {@link #getVocDataSet(File)}; the images it names are looked
 * up in the directory that contains the label file. The resulting dataset is serialized to
 * {@code <targetPath>/annotations/coco.json} (UTF-8) and the images are then handed to
 * {@code copyImage}. Errors raised during the conversion are caught and logged. Nothing is
 * written when the label file yields no entries.
 *
 * @param labelTxtPath path of the label file
 * @param targetPath   output directory
 */
public static void ocrDataSetToCoco(String labelTxtPath, String targetPath) {
    Path labelPath = Path.of(labelTxtPath);
    // isRegularFile is already false for a path that does not exist.
    if (!Files.isRegularFile(labelPath)) {
        log.error("Label file does not exist or is not a regular file: {}", labelTxtPath);
        return;
    }
    try {
        List<VocImage> vocImages = getVocDataSet(labelPath.toFile());
        if (CollUtil.isEmpty(vocImages)) {
            log.warn("No usable entries found in label file: {}", labelTxtPath);
            return;
        }
        // Category name -> category id
        Map<String, Integer> vocKeyCls = getVocKeyCls(vocImages);
        // Resolve against the absolute path: getParent() is null for a bare relative
        // file name such as "Label.txt".
        String imageParentPath = labelPath.toAbsolutePath().getParent().toString();

        DataSet dataSet = new DataSet();
        dataSet.setImages(analysisImages(vocImages, imageParentPath));
        dataSet.setAnnotations(analysisAnnotations(vocImages, vocKeyCls));
        dataSet.setCategories(analysisCategories(vocKeyCls));

        // Prepare the output directory (createDir is defined elsewhere in this class).
        createDir(targetPath);

        // Serialize straight to bytes: Jackson emits UTF-8, whereas String.getBytes()
        // would use the platform default charset.
        ObjectMapper objectMapper = new ObjectMapper();
        String jsonDir = targetPath + File.separator + "annotations";
        Files.write(Path.of(jsonDir).resolve("coco.json"), objectMapper.writeValueAsBytes(dataSet));

        // Copy the referenced images next to the annotations.
        copyImage(imageParentPath, targetPath, dataSet.getImages());
    } catch (Exception e) {
        log.error("Error parsing label file: {}", labelTxtPath, e);
    }
}
/**
 * Parses a label file into {@link VocImage} objects.
 *
 * <p>Each line is split on tab characters; the first field is the image path and the second
 * field is the JSON description of its items. Only the part of the image path after the last
 * {@code '/'} is kept as the file name. Blank lines and lines that lack either field are
 * skipped. Ids are assigned sequentially, starting at 1, to the lines that are accepted.
 *
 * <p>If reading the file or deserializing a line fails, the error is logged and the entries
 * parsed up to that point are returned.
 *
 * @param labelTxtPath the label file; it is read as UTF-8
 * @return the parsed entries, possibly empty, never {@code null}
 */
public static List<VocImage> getVocDataSet(File labelTxtPath) {
    List<VocImage> vocImages = new ArrayList<>();
    ObjectMapper objectMapper = new ObjectMapper();
    // Read with an explicit charset instead of the platform default.
    try (BufferedReader bufferedReader = new BufferedReader(
            new FileReader(labelTxtPath, java.nio.charset.StandardCharsets.UTF_8))) {
        int nextId = 1;
        int lineNumber = 0;
        String line;
        while ((line = bufferedReader.readLine()) != null) {
            lineNumber++;
            if (line.isBlank()) {
                continue;
            }
            // Split once; a line without a tab would otherwise fail on index 1 and
            // abort the whole file.
            String[] fields = line.split("\t");
            if (fields.length < 2 || fields[0].isBlank() || fields[1].isBlank()) {
                log.warn("Skipping malformed label line {} in {}", lineNumber, labelTxtPath);
                continue;
            }
            String fileName = fields[0];
            String itemsJson = fields[1];

            VocImage vocImage = new VocImage();
            // Keep only the file-name part of the path.
            vocImage.setFileName(fileName.substring(fileName.lastIndexOf("/") + 1));
            vocImage.setId(nextId++);
            // Deserialize the item list; the target type is inferred from setItems.
            vocImage.setItems(objectMapper.readValue(itemsJson, new TypeReference<>() {
            }));
            vocImages.add(vocImage);
        }
    } catch (Exception e) {
        // Pass the exception along so the cause is not lost.
        log.error("读取Voc数据集失败", e);
    }
    return vocImages;
}
/**
 * Builds the map from category name ({@code keyCls}) to category id for every item of the
 * given images.
 *
 * <p>Ids start at 1 and are assigned in order of first appearance, so the same input always
 * produces the same mapping. The map is filled sequentially: the previous implementation
 * updated a plain {@code HashMap} from a parallel stream, which is not thread-safe.
 *
 * @param vocImages parsed label entries; {@code null} is treated as empty
 * @return a mutable map from category name to category id
 */
public static Map<String, Integer> getVocKeyCls(List<VocImage> vocImages) {
    Map<String, Integer> vocKeyCls = new LinkedHashMap<>();
    if (vocImages == null) {
        return vocKeyCls;
    }
    for (VocImage vocImage : vocImages) {
        if (vocImage.getItems() == null) {
            continue;
        }
        for (VocItem vocItem : vocImage.getItems()) {
            // size() + 1 is the next unused id because ids are contiguous from 1.
            vocKeyCls.putIfAbsent(vocItem.getKeyCls(), vocKeyCls.size() + 1);
        }
    }
    return vocKeyCls;
}
/**
 * Builds the COCO {@code Image} entries for the given label entries.
 *
 * <p>Each entry takes its id and file name from the corresponding {@link VocImage}. The image
 * file is loaded from {@code imageParentPath} to obtain its height and width. When a file
 * cannot be read or decoded, the problem is logged and the entry is still returned, with
 * height and width left unset, so one bad image does not discard all the others.
 *
 * @param vocImages       parsed label entries
 * @param imageParentPath directory that contains the image files
 * @return one entry per label entry, in the same order; an empty list when
 *         {@code vocImages} is {@code null} or empty, or when processing fails unexpectedly
 */
public static List<Image> analysisImages(List<VocImage> vocImages, String imageParentPath) {
    if (vocImages == null || vocImages.isEmpty()) {
        return Collections.emptyList();
    }
    try {
        return vocImages.parallelStream()
                .map(vocImage -> {
                    Image image = new Image();
                    image.setId(vocImage.getId());
                    image.setFileName(vocImage.getFileName());
                    String imagePath = imageParentPath + File.separator + vocImage.getFileName();
                    try {
                        BufferedImage bufferedImage = ImageIO.read(new File(imagePath));
                        if (bufferedImage == null) {
                            // ImageIO.read returns null when no registered reader can
                            // decode the file.
                            log.error("Unsupported or corrupt image, size not set: {}", imagePath);
                        } else {
                            image.setHeight(bufferedImage.getHeight());
                            image.setWidth(bufferedImage.getWidth());
                        }
                    } catch (IOException e) {
                        log.error("读取图片信息失败: {}", vocImage.getFileName(), e);
                    }
                    return image;
                })
                .collect(Collectors.toList());
    } catch (Exception e) {
        log.error("解析Images失败:", e);
        return new ArrayList<>();
    }
}
/*
* @return java.util.List<com.hsk.entity.CoCo.Annotation>
* @Author HuangShiKai
* @Description 解析Annotations
* @Date 2024/5/22 18:42
* @Param [vocImages]
**/
private static List<Annotation> analysisAnnotations(List<VocImage> vocImages, Map<String, Integer> vocKeyCls) {
List<Annotation> annotations = new ArrayList<>();
try {
AtomicInteger id = new AtomicInteger(1);
vocImages.forEach(vocImage -> {
// 根据文件名分组,获取vocimage对象
Map<String, List<VocItem>> vocItemMap = vocImage.getItems().parallelStream().collect(Collectors.groupingBy(VocItem::getKeyCls));
vocItemMap.forEach((keyCls, vocItems) -> {
Annotation annotation = new Annotation();
annotation.setId(id.getAndIncrement());
// 图片id
annotation.setImageId(vocImage.getId());
// 分类id
annotation.setCategoryId(vocKeyCls.get(keyCls));
ExtraInfo extraInfo = new ExtraInfo();
extraInfo.setLab
没有合适的资源?快使用搜索试试~ 我知道了~
资源推荐
资源详情
资源评论
收起资源包目录
PaddleDemo-master.zip (16个子文件)
PaddleDemo-master
pom.xml 2KB
src
test
java
com
hsk
PaddleDemoApplicationTests.java 3KB
main
resources
application.properties 35B
java
com
hsk
utils
PaddleServingUtil.java 4KB
DataSetUtil.java 13KB
PaddleDemoApplication.java 320B
entity
voc
VocItem.java 761B
VocImage.java 480B
CoCo
DataSet.java 867B
ExtraInfo.java 545B
Annotation.java 1KB
Categories.java 479B
Image.java 756B
LabelValue.java 560B
LICENSE 11KB
.gitignore 297B
共 16 条
- 1
资源评论
Java程序员-张凯
- 粉丝: 1w+
- 资源: 7353
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
最新资源
资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
点击此处反馈
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功