Commit e741f194 by 石头

Merge remote-tracking branch 'origin/master'

2 parents 83564d06 ceef850a
...@@ -25,14 +25,18 @@ import com.dookay.coral.common.core.utils.lang.CollectionUtils; ...@@ -25,14 +25,18 @@ import com.dookay.coral.common.core.utils.lang.CollectionUtils;
import com.dookay.coral.common.core.utils.lang.StringUtils; import com.dookay.coral.common.core.utils.lang.StringUtils;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections4.ListUtils; import org.apache.commons.collections4.ListUtils;
import org.apache.commons.collections4.set.ListOrderedSet; import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.json.JSONArray; import org.json.JSONArray;
import org.json.JSONException; import org.json.JSONException;
import org.json.JSONObject; import org.json.JSONObject;
import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.redis.core.StringRedisTemplate;
import org.springframework.stereotype.Component; import org.springframework.stereotype.Component;
import java.util.*; import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors; import java.util.stream.Collectors;
/** /**
...@@ -45,13 +49,23 @@ import java.util.stream.Collectors; ...@@ -45,13 +49,23 @@ import java.util.stream.Collectors;
@Slf4j @Slf4j
public final class AipUtilBean { public final class AipUtilBean {
private static final ConcurrentHashMap<String, Double> SCORE_MAP = new ConcurrentHashMap<>();
private static final String SCORE_KEY_PREFIX = "WORD_SCORE:";
private static final double CRITICAL_VALUE = 0.4D;
/** /**
* 内部错误 * 内部错误
*/ */
private static final String INTERNAL_ERROR = "282000"; private static final String INTERNAL_ERROR = "282000";
private final AipNlp aipNlp;
private final StringRedisTemplate template;
@Autowired @Autowired
private AipNlp aipNlp; public AipUtilBean(AipNlp aipNlp, StringRedisTemplate template) {
this.aipNlp = aipNlp;
this.template = template;
}
/** /**
* 抽取查询关键词 * 抽取查询关键词
...@@ -123,6 +137,7 @@ public final class AipUtilBean { ...@@ -123,6 +137,7 @@ public final class AipUtilBean {
/** /**
* 抽取关键词 * 抽取关键词
* *
* @param keyword
* @param document 文档 * @param document 文档
* @param size 关键词个数 * @param size 关键词个数
* @return * @return
...@@ -130,28 +145,38 @@ public final class AipUtilBean { ...@@ -130,28 +145,38 @@ public final class AipUtilBean {
* @author houkun * @author houkun
* @date 2017/12/6 * @date 2017/12/6
*/ */
public List<String> extractKeyWords(String document, int size) throws JSONException { public List<String> extractKeyWords(String keyword, String document, int size) throws JSONException {
List<LexerItem> lexerItems = getLexerItems(document); List<LexerItem> lexerItems = getLexerItems(document);
Map<String, Set<LexerItem>> itemMap = lexerItems.stream() List<String> keywords = lexerItems.parallelStream()
.filter(l -> LexerPosConst.inThis(l.getPos()) .filter(l -> LexerPosConst.inThis(l.getPos())
|| LexerNeConst.inThis(l.getNe())) || LexerNeConst.inThis(l.getNe()))
.collect(Collectors.groupingBy( .map(LexerItem::getItem)
LexerItem::getItem, .distinct()
Collectors.mapping(l -> l, Collectors.toSet()))); .filter(w -> doSimnet(w, keyword) > CRITICAL_VALUE)
ListOrderedSet<String> words = new ListOrderedSet<>(); .sorted(Comparator.comparingDouble(word -> Double.parseDouble(template.opsForValue().get(getScoreKey((String) word, keyword)))).reversed())
// 规则1 优先专有名词 .limit(size)
for (Map.Entry<String, Set<LexerItem>> entry : itemMap.entrySet()) { .collect(Collectors.toList());
Set<LexerItem> value = entry.getValue(); return keywords;
boolean isNe = value.stream().anyMatch(l -> LexerNeConst.inThis(l.getNe())); }
if (isNe) {
words.add(entry.getKey()); /**
} * 计算关键词相关
} *
// 规则2 其他名词按照出现次数 * @param keyword
itemMap.entrySet().stream().sorted(Comparator.comparing(e -> e.getValue().size())).forEach(e -> { * @param words
words.add(e.getKey()); * @return
}); * @author houkun
return words.stream().limit(size).collect(Collectors.toList()); * @date 2017/12/6
*/
public Map<Pair<String, String>, Double> calcKeywordsRelated(String keyword, List<String> words) {
Map<Pair<String, String>, Double> map = new HashMap<>(words.size());
words.parallelStream()
.forEach(word -> {
double score = doSimEmbedding(keyword, word);
map.put(new ImmutablePair<>(keyword, word), score);
}
);
return map;
} }
...@@ -172,7 +197,6 @@ public final class AipUtilBean { ...@@ -172,7 +197,6 @@ public final class AipUtilBean {
log.debug(s); log.debug(s);
JSONObject res = doLexer(s); JSONObject res = doLexer(s);
JSONArray items = res.getJSONArray("items"); JSONArray items = res.getJSONArray("items");
// log.debug(items.toString(2));
lexerItems = ListUtils.union(lexerItems, JSON.parseArray(items.toString(), LexerItem.class)); lexerItems = ListUtils.union(lexerItems, JSON.parseArray(items.toString(), LexerItem.class));
} }
return lexerItems; return lexerItems;
...@@ -226,6 +250,81 @@ public final class AipUtilBean { ...@@ -226,6 +250,81 @@ public final class AipUtilBean {
return res; return res;
} }
/**
 * Computes the relatedness score of two words.
 *
 * <p>The score is first looked up in Redis under {@code getScoreKey(s1, s2)}.
 * On a cache miss the word-embedding similarity API is called; a successful
 * score is written back to Redis with a one hour expiry. When the API answers
 * with an error other than {@code INTERNAL_ERROR}, the response is logged and
 * the score is taken from {@code doSimnet} instead (and cached the same way).
 *
 * @param s1 first word
 * @param s2 second word
 * @return the relatedness score, or {@code 0} when the response cannot be
 *         parsed or the service keeps reporting an internal error
 * @author houkun
 * @date 2017/12/6
 */
private double doSimEmbedding(String s1, String s2) {
    // Upper bound on calls per lookup. The previous version retried by calling
    // itself without any limit, so a service stuck on INTERNAL_ERROR ended in a
    // StackOverflowError.
    final int maxAttempts = 3;

    String key = getScoreKey(s1, s2);
    String cached = template.opsForValue().get(key);
    if (StringUtils.isNotEmpty(cached)) {
        return Double.parseDouble(cached);
    }
    try {
        for (int attempt = 1; attempt <= maxAttempts; attempt++) {
            JSONObject res = aipNlp.wordSimEmbedding(s1, s2);
            double score;
            if (!res.has("error_code")) {
                score = res.getDouble("score");
            } else {
                // Read the code through get() + String.valueOf so that it works
                // whether the service sends it as a JSON string or a JSON number.
                String errorCode = String.valueOf(res.get("error_code"));
                if (INTERNAL_ERROR.equals(errorCode)) {
                    log.warn("wordSimEmbedding internal error, attempt {}/{}", attempt, maxAttempts);
                    continue;
                }
                log.warn(res.toString());
                score = doSimnet(s1, s2);
            }
            template.opsForValue().set(key, String.valueOf(score), 1, TimeUnit.HOURS);
            return score;
        }
        // Retries exhausted: report "no relation" but do not cache it, so the
        // next lookup can try the service again.
        log.warn("wordSimEmbedding gave up after {} attempts for key {}", maxAttempts, key);
        return 0;
    } catch (JSONException e) {
        // Malformed response: keep the best-effort contract (score 0) but leave a trace.
        log.warn("wordSimEmbedding returned an unreadable response for key " + key, e);
        return 0;
    }
}
/**
 * Computes the relatedness score of two words through the older simnet API.
 *
 * <p>A response without an {@code "error"} field yields its {@code "score"}.
 * An {@code INTERNAL_ERROR} response is retried a bounded number of times;
 * any other error code is raised as a {@code ServiceException}.
 *
 * <p>This method does not write the score to Redis.
 * NOTE(review): the sibling doSimEmbedding checks the field "error_code" while
 * this method checks "error" - confirm which field the simnet endpoint really
 * returns.
 *
 * @param s1 first word
 * @param s2 second word
 * @return the similarity score, or {@code 0} when the response cannot be parsed
 * @author houkun
 * @date 2017/12/6
 */
private double doSimnet(String s1, String s2) {
    // Upper bound on calls per lookup. The previous version retried by calling
    // itself without any limit, so a service stuck on INTERNAL_ERROR ended in a
    // StackOverflowError.
    final int maxAttempts = 3;

    for (int attempt = 1; attempt <= maxAttempts; attempt++) {
        JSONObject res = aipNlp.simnet(s1, s2, new HashMap<>(0));
        try {
            if (!res.has("error")) {
                return res.getDouble("score");
            }
            // Read the code through get() + String.valueOf so that it works
            // whether the service sends it as a JSON string or a JSON number.
            String errorCode = String.valueOf(res.get("error"));
            if (!INTERNAL_ERROR.equals(errorCode)) {
                throw new ServiceException(errorCode);
            }
            // INTERNAL_ERROR: fall through and try again.
        } catch (JSONException ignored) {
            // Best-effort by design: an unreadable response counts as "no relation".
            return 0;
        }
    }
    // Still failing after the last attempt: surface it the same way as any other API error.
    throw new ServiceException(INTERNAL_ERROR);
}
/**
 * Builds the Redis key under which the score of an ordered word pair is stored:
 * the score-key prefix, then {@code s1}, a colon, and {@code s2}.
 * The key depends on argument order, so (a, b) and (b, a) map to different keys.
 */
private String getScoreKey(String s1, String s2) {
    StringBuilder key = new StringBuilder(SCORE_KEY_PREFIX);
    key.append(s1);
    key.append(":");
    key.append(s2);
    return key.toString();
}
/** /**
* 分割文档 * 分割文档
......
...@@ -48,14 +48,14 @@ public class BaiduDemo { ...@@ -48,14 +48,14 @@ public class BaiduDemo {
// System.out.println(res.toString(2)); // System.out.println(res.toString(2));
// //
// // 句法 // // 句法
HashMap<String, Object> option = new HashMap<>(); // HashMap<String, Object> option = new HashMap<>();
option.put("mode", 1); // option.put("mode", 1);
JSONObject des = client.depParser(text, option); // JSONObject des = client.depParser(text, option);
System.out.println(des.toString(2)); // System.out.println(des.toString(2));
JSONObject des1 = client.depParser(text1, option); // JSONObject des1 = client.depParser(text1, option);
System.out.print(des1.toString(2)); // System.out.print(des1.toString(2));
JSONObject des2 = client.depParser(text2, option); // JSONObject des2 = client.depParser(text2, option);
System.out.print(des2.toString(2)); // System.out.print(des2.toString(2));
// //
// // 相似 // // 相似
...@@ -64,9 +64,9 @@ public class BaiduDemo { ...@@ -64,9 +64,9 @@ public class BaiduDemo {
// JSONObject response = client.simnet("十九大","中国共产党第十九次全国代表大会", option1); // JSONObject response = client.simnet("十九大","中国共产党第十九次全国代表大会", option1);
// System.out.println(response.toString(2)); // System.out.println(response.toString(2));
// // 词向量 // 词向量
// JSONObject vect = client.wordEmbedding("习近平"); JSONObject vect = client.wordEmbedding("十九大");
// System.out.println(vect.toString(2)); System.out.println(vect.toString(2));
// //
// // 相似 // // 相似
......
...@@ -16,6 +16,7 @@ package com.dookay.cihai.core; ...@@ -16,6 +16,7 @@ package com.dookay.cihai.core;
import com.alibaba.fastjson.JSON; import com.alibaba.fastjson.JSON;
import com.dookay.cihai.core.aip.AipUtilBean; import com.dookay.cihai.core.aip.AipUtilBean;
import org.apache.commons.lang3.tuple.Pair;
import org.junit.Assert; import org.junit.Assert;
import org.junit.Test; import org.junit.Test;
import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Autowired;
...@@ -70,9 +71,16 @@ public class BaiduTest extends CihaiCoreApplicationTests { ...@@ -70,9 +71,16 @@ public class BaiduTest extends CihaiCoreApplicationTests {
File file = resource.getFile(); File file = resource.getFile();
FileReader reader = new FileReader(file); FileReader reader = new FileReader(file);
String text = FileCopyUtils.copyToString(reader); String text = FileCopyUtils.copyToString(reader);
List<String> list = aipUtilBean.extractKeyWords(text, 10); List<String> list = aipUtilBean.extractKeyWords("中国共产党第十九次全国代表大会", text, 15);
System.out.println(JSON.toJSONString(list)); System.out.println(JSON.toJSONString(list));
Map<String, Long> map = aipUtilBean.extractNounWordsWithCount(text); Map<Pair<String, String>, Double> map = aipUtilBean.calcKeywordsRelated("中国共产党第十九次全国代表大会", list);
System.out.print(map.toString()); for (String s : list) {
Map<Pair<String, String>, Double> map1 = aipUtilBean.calcKeywordsRelated(s, list);
System.out.print(map1);
}
System.out.print(map);
// Map<String, Long> map = aipUtilBean.extractNounWordsWithCount(text);
// System.out.print(map.toString());
} }
} }
...@@ -15,7 +15,13 @@ spring.datasource.druid.filter.config.enabled=true ...@@ -15,7 +15,13 @@ spring.datasource.druid.filter.config.enabled=true
mapper.mappers=com.dookay.coral.common.core.persistence.Mapper mapper.mappers=com.dookay.coral.common.core.persistence.Mapper
mybatis.mapper-locations=classpath*:mapper/*.xml mybatis.mapper-locations=classpath*:mapper/*.xml
spring.redis.host=192.168.2.27
spring.redis.password=100001
aip.app-id=10486245 aip.app-id=10486245
aip.api-key=ws8qdxT51xm2qbWufxzRedI3 aip.api-key=ws8qdxT51xm2qbWufxzRedI3
aip.secret-key=8b6g9ZyR69dFl6aqYdIOGa4IbOGgkdjh aip.secret-key=8b6g9ZyR69dFl6aqYdIOGa4IbOGgkdjh
logging.level.com.dookay.cihai.core=debug logging.level.com.dookay.cihai.core=debug
Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!