{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 机器学习与社会科学应用"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 第四章 自然语言处理入门"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 第四节 文本相似度"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<font face=\"宋体\" >郭峰    \n",
    "    教授、博士生导师  \n",
    "上海财经大学公共经济与管理学院  \n",
    "上海财经大学数实融合与智能治理实验室    \n",
    "    邮箱：guofengsfi@163.com</font> "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<font face=\"宋体\" >本节目录  \n",
    "4.1.导入数据  \n",
    "4.2.将关键词处理成自定义词典  \n",
    "4.3.分词处理  \n",
    "4.4.建立语料库  \n",
    "4.5.文本相似度计算</font> "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4.1. 导入数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import datetime\n",
    "starttime = datetime.datetime.now()\n",
    "\n",
    "path = \"D:/python/机器学习与社会科学应用/演示数据/04自然语言处理入门/tfidf相似度计算/\"\n",
    "cssci = pd.read_csv(path+\"cssci_clean_test.csv\",encoding='utf-8')\n",
    "print(cssci.shape)\n",
    "\n",
    "\n",
    "# 计算主题模型时，需要将标题、关键词和摘要合并\n",
    "cssci['keyword'] = cssci['keyword'].fillna(\";\")\n",
    "cssci['content'] = cssci['title']+\";\"+cssci['keyword']+\";\"+cssci['abstract']\n",
    "cssci=cssci[cssci['content'].str.len()>100]   # 将标题+关键词+摘要少于100字的样本删除\n",
    "print(\"标题+关键词+摘要少于100字的样本删除后数量:\"+str(len(cssci))) # 查看行*列数\n",
    "cssci.to_csv(path+'cssci_clean_short.csv',encoding='utf8',index=False)\n",
    "print(cssci.shape)\n",
    "print(cssci.year.min())\n",
    "endtime = datetime.datetime.now()\n",
    "print((endtime - starttime).seconds)\n",
    "cssci.head()"
   ]
  },
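  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal sketch of the merge step above on a two-row toy DataFrame (hypothetical data, not the CSSCI file): `fillna` guarantees that `keyword` is a string before the `title;keyword;abstract` concatenation, so rows with missing keywords do not turn into `NaN`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Toy data (hypothetical): the second paper has no keywords\n",
    "toy = pd.DataFrame({'title': ['paper A', 'paper B'],\n",
    "                    'keyword': ['machine learning;text analysis', None],\n",
    "                    'abstract': ['abstract A', 'abstract B']})\n",
    "toy['keyword'] = toy['keyword'].fillna(';')  # without this, str + NaN yields NaN\n",
    "toy['content'] = toy['title'] + ';' + toy['keyword'] + ';' + toy['abstract']\n",
    "print(toy['content'].tolist())\n",
    "# ['paper A;machine learning;text analysis;abstract A', 'paper B;;;abstract B']"
   ]
  },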
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4.2. 将关键词处理成自定义词典"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 根据关键词为分词准备自定义词典\n",
    "import jieba\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import datetime\n",
    "starttime = datetime.datetime.now()\n",
    "\n",
    "path =  \"D:/python/机器学习与社会科学应用/演示数据//04自然语言处理入门/tfidf相似度计算/\"\n",
    "\n",
    "cssci = pd.read_csv(path+\"cssci_clean_short.csv\",encoding='utf-8')\n",
    "# cssci = cssci[0:10000]\n",
    "print(cssci.shape)\n",
    "\n",
    "# 去掉一些关键词较为特殊的样本\n",
    "# 关键词不能为空，且长度不超过30字符，早期系统自动识别的关键词数量较多\n",
    "cssci = cssci[cssci['kwnum']<6]\n",
    "cssci = cssci[cssci['keyword'].str.len()>1]\n",
    "cssci = cssci[cssci['keyword'].str.len()<30]\n",
    "\n",
    "keyword = cssci[['keyword']]\n",
    "print(keyword[0:20])\n",
    "print(\"包含正常关键词的论文数量：\"+str(len(keyword)))\n",
    "\n",
    "# 一行变多行\n",
    "keyword = keyword['keyword'].str.split(';', expand=True).stack()\n",
    "keyword.to_csv(path+'keyword.csv',encoding='utf8',index=False)\n",
    "f2 = open(path+\"keyword.csv\",encoding='utf-8')\n",
    "keyword = pd.read_csv(f2,header=0,sep=',')\n",
    "keyword.rename(columns={'0':'keyword'}, inplace = True)\n",
    "\n",
    "\n",
    "# 删除空值\n",
    "keyword = keyword.dropna() \n",
    "print(keyword[0:20])\n",
    "\n",
    "print(\"关键词累计总数量：\"+str(len(keyword)))\n",
    "\n",
    "\n",
    "# 去掉一些过长或者过短的关键词\n",
    "keyword = keyword[keyword['keyword'].str.len()>1]\n",
    "keyword = keyword[keyword['keyword'].str.len()<7]\n",
    "\n",
    "print(\"剔除过长过短关键词后数量：\"+str(len(keyword)))\n",
    "\n",
    "# 统计关键词重复出现的次数\n",
    "group1 = keyword.groupby(['keyword'])\n",
    "keyword_count = pd.DataFrame(columns=[\"keyword_count\"])\n",
    "keyword_count['keyword_count'] = group1['keyword'].count()\n",
    "keyword_count.to_csv(path+'keyword_count.csv',encoding='utf8')\n",
    "\n",
    "f = open(path+\"keyword_count.csv\",encoding='utf-8')\n",
    "keyword_count = pd.read_csv(f,header=0,sep=',')\n",
    "keyword = pd.merge(keyword,keyword_count,how='left')\n",
    "\n",
    "# 删除重复值\n",
    "keyword.drop_duplicates(subset=['keyword'],keep='first',inplace=True) \n",
    "print(\"删除重复后的关键词个数：\",len(keyword))\n",
    "\n",
    "keyword.to_csv(path+'keyword_count.csv',encoding='utf8',index=False)\n",
    "keyword = keyword[keyword['keyword_count']>1]\n",
    "keyword = keyword[['keyword']]\n",
    "print(\"剔除仅出现1次的关键词后数量：\"+str(len(keyword)))\n",
    "keyword.to_csv(path+'keyword.txt',encoding='utf8',index=False,header=False)\n",
    "\n",
    "endtime = datetime.datetime.now()\n",
    "print((endtime - starttime).seconds)"
   ]
  },
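  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The one-row-to-many-rows step above relies on `str.split(';', expand=True).stack()`. A minimal sketch on a toy Series (hypothetical values) showing what it produces:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# One keyword string per row -> one keyword per row\n",
    "s = pd.Series(['machine learning;text mining', 'tfidf;similarity'])\n",
    "stacked = s.str.split(';', expand=True).stack()\n",
    "print(stacked)\n",
    "# In recent pandas the same result comes from s.str.split(';').explode()"
   ]
  },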
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4.3. 分词处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 重新生成关键词词典\n",
    "# 自定义词典格式：词 词频 词性（可省略）\n",
    "from collections import Counter\n",
    "\n",
    "path = \"D:/python/机器学习与社会科学应用/演示数据/04自然语言处理入门/tfidf相似度计算/\"\n",
    "\n",
    "keywords = open(path+\"keyword.txt\", encoding='utf8').read()\n",
    "keywords = keywords.strip().split('\\n')\n",
    "keywords = dict(Counter(keywords))\n",
    "with open(path+'keywords.txt','w',encoding='utf8') as f:\n",
    "    for key, value in keywords.items():\n",
    "        ele = key + \" \" + str(value) + '\\n'\n",
    "        f.write(ele)"
   ]
  },
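  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal sketch of how an entry in the `word frequency` format above changes jieba's segmentation. `jieba.add_word` is equivalent to one line of a `load_userdict` file; the term and the \"before\" split are illustrative, since the default segmentation depends on jieba's built-in dictionary."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import jieba\n",
    "\n",
    "sentence = '数实融合与智能治理'\n",
    "print('before:', jieba.lcut(sentence))\n",
    "\n",
    "# Equivalent to the line \"数实融合 100\" in a user-dictionary file\n",
    "jieba.add_word('数实融合', freq=100)\n",
    "print('after: ', jieba.lcut(sentence))"
   ]
  },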
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 分词，全部运行要一段时间\n",
    "import jieba\n",
    "import jieba.posseg as pseg\n",
    "import pandas as pd\n",
    "import re\n",
    "import numpy as np\n",
    "import datetime\n",
    "starttime = datetime.datetime.now()\n",
    "\n",
    "\n",
    "path = \"D:/python/机器学习与社会科学应用/演示数据/04自然语言处理入门/tfidf相似度计算/\"\n",
    "cssci = pd.read_csv(path+\"cssci_clean_short.csv\",encoding='utf-8')\n",
    "# cssci=cssci[0:100]\n",
    "\n",
    "# 把停用词做成字典\n",
    "jieba.load_userdict(path+\"keywords.txt\") # 加载自定义词典\n",
    "stopwords = {}\n",
    "fstop = open(path+'stopword.txt', 'r')\n",
    "for eachWord in fstop:\n",
    "    stopwords[eachWord.strip()] = eachWord.strip()\n",
    "fstop.close()\n",
    "\n",
    "#切词的函数\n",
    "def word_cut(x):\n",
    "    line = x['content'].strip()\n",
    "    line1 = re.sub(\"[0-9\\s+\\.\\!\\/_,$%^*()?;；:-【】+\\\"\\']+|[+——！，;:。？、~@#￥%……&*（）]+\", \"\",line)\n",
    "    wordList = list(jieba.cut(line1)) # 用结巴分词，对每行内容进行分词  \n",
    "    outStr = ''  \n",
    "    for word in wordList:\n",
    "        if word not in stopwords:  \n",
    "            outStr += word  \n",
    "            outStr += ' '  \n",
    "    return outStr\n",
    "cssci['cut_out'] = cssci.apply(word_cut, axis=1)\n",
    "\n",
    "    \n",
    "print(cssci['title'][0])\n",
    "print(cssci['cut_out'][0])\n",
    "cssci[\"cutlength\"] = cssci['cut_out'].str.len()\n",
    "cssci = cssci[cssci['cutlength'] >2] # 分词之后，部分出现空值等异常现象\n",
    "\n",
    "cut_out = cssci[['cut_out']]\n",
    "cssci.to_csv(path+'cssci_title_cut.csv',encoding='utf8',index=False)\n",
    "cut_out.to_csv(path+'cut_out.csv',encoding='utf8')\n",
    "print(cssci.shape)\n",
    "endtime = datetime.datetime.now()\n",
    "print((endtime - starttime).seconds)"
   ]
  },
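  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal sketch of the tokenize-then-filter pattern inside `word_cut`, with a toy stopword set standing in for `stopword.txt`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import jieba\n",
    "\n",
    "stopwords = {'的', '与'}  # toy stopword set (hypothetical)\n",
    "tokens = jieba.lcut('文本的相似度计算与应用')\n",
    "print(' '.join(t for t in tokens if t not in stopwords))"
   ]
  },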
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4.4. 建立语料库"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from gensim import corpora,models,similarities\n",
    "from collections import  defaultdict\n",
    "import pandas as pd\n",
    "import re\n",
    "import numpy as np\n",
    "import datetime\n",
    "starttime = datetime.datetime.now()\n",
    "\n",
    "# 函数：建立语料库\n",
    "def get_dict(cutwords):\n",
    "    #print(cutwords[0])\n",
    "    texts = [cutword.split() for cutword in cutwords]\n",
    "    frequency = defaultdict(int)\n",
    "    for text in texts:\n",
    "        for token in text:\n",
    "            frequency[token] += 1\n",
    "    texts = [ [ token for token in text if frequency[token] > 5 ] for text in texts]\n",
    "    dictionary = corpora.Dictionary(texts) \n",
    "    corpus = [dictionary.doc2bow(text) for text in texts]\n",
    "    # print(corpus[0])\n",
    "    return dictionary,corpus\n",
    "\n",
    "\n",
    "# 导入数据，已经完成了分词模式\n",
    "path = \"D:/python/机器学习与社会科学应用/演示数据/04自然语言处理入门/tfidf相似度计算/\"\n",
    "cssci = pd.read_csv(path+\"cssci_title_cut.csv\", encoding='utf-8')\n",
    "# cssci=cssci[0:100]\n",
    "print(\"cssci样本量：\", len(cssci))\n",
    "\n",
    "# 计算dictionary,corpu\n",
    "cutwords = cssci['cut_out']\n",
    "dictionary,corpus = get_dict(cutwords)\n",
    "tfidf = models.TfidfModel(corpus)\n",
    "\n",
    "print(\"dictionary样本量：\", len(dictionary))\n",
    "\n",
    "# 模型结果保存\n",
    "tfidf.save(path+\"model.tfidf\")\n",
    "dictionary.save(path+'dictionary_tfidf.dict')  # 保存生成的词典\n",
    "\n",
    "\n",
    "endtime = datetime.datetime.now()\n",
    "print((endtime - starttime).seconds)\n"
   ]
  },
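  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal sketch of what `Dictionary`, `doc2bow`, and `TfidfModel` produce on a three-document toy corpus (hypothetical tokens). With gensim's default weighting, a token that occurs in every document gets an IDF of zero and drops out; rarer tokens get higher weights."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from gensim import corpora, models\n",
    "\n",
    "# Toy corpus: 'data' appears in every document, 'tax' in only one\n",
    "texts = [['data', 'policy', 'tax'],\n",
    "         ['data', 'policy'],\n",
    "         ['data', 'growth']]\n",
    "dictionary = corpora.Dictionary(texts)\n",
    "corpus = [dictionary.doc2bow(t) for t in texts]  # [(token_id, count), ...] per document\n",
    "tfidf = models.TfidfModel(corpus)\n",
    "for doc in tfidf[corpus]:\n",
    "    # 'data' is absent from every line: its document frequency equals the corpus size, so its weight is 0\n",
    "    print([(dictionary[i], round(w, 3)) for i, w in doc])"
   ]
  },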
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4.5.文本相似度计算"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true,
    "tags": []
   },
   "outputs": [],
   "source": [
    "# 这里的相似度是计算某个文章与上年所有top 5%论文的相似度，求其最大值；\n",
    "from gensim import corpora,models,similarities\n",
    "from collections import  defaultdict\n",
    "import pandas as pd\n",
    "import re\n",
    "import numpy as np\n",
    "import datetime\n",
    "starttime = datetime.datetime.now()\n",
    "\n",
    "def tfidf_sim(text1,text2,dictionary):\n",
    "    # 文档1\n",
    "    # text1 = text1.split()\n",
    "    text1 = [cutword.split() for cutword in text1]\n",
    "    # print(text1[0])\n",
    "    # corpus1 = dictionary.doc2bow(text1)  # 文档转换成bow\n",
    "    corpus1 = [dictionary.doc2bow(text) for text in text1]  # 文档转换成bow\n",
    "    # print(corpus1[0])\n",
    "    # corpus1 = [corpus1]\n",
    "    text1_tfidf = tfidf[corpus1]\n",
    "    tfidf_sim = similarities.SparseMatrixSimilarity(text1_tfidf, num_features=len(dictionary.keys()))\n",
    "\n",
    "    # 文档2\n",
    "    text2 = text2.split()\n",
    "    # print(text2)\n",
    "    corpus2 = dictionary.doc2bow(text2)  # 文档转换成bow\n",
    "    text2_tfidf = tfidf[corpus2]\n",
    "    sim = tfidf_sim[text2_tfidf]\n",
    "    sim2 = sorted(enumerate(sim), key=lambda item: -item[1])\n",
    "    # print(sim2[0])\n",
    "    return sim2[0][0],sim2[0][1]   #sim2是一个元组组成的列表，第一个为最大值及其对应的序号，详见上文第一小节\n",
    "\n",
    "    \n",
    "# 导入数据，已经完成了分词模式\n",
    "path = \"D:/python/机器学习与社会科学应用/演示数据/04自然语言处理入门/tfidf相似度计算/\"\n",
    "f = open(path+\"cssci_title_cut.csv\", encoding='utf-8')\n",
    "cssci = pd.read_csv(f,header=0, sep=',')\n",
    "print(\"样本量：\", len(cssci))\n",
    "\n",
    "# 计算dictionary,corpu\n",
    "tfidf = models.TfidfModel.load(path+\"model.tfidf\")\n",
    "dictionary = corpora.Dictionary.load(path+'dictionary_tfidf.dict')  # 加载\n",
    "\n",
    "cssci['sim'] = \"\"\n",
    "cssci['nearest_title'] = \"\"\n",
    "\n",
    "cssci_2001 = cssci[cssci.year==2001]\n",
    "cssci_new = cssci_2001   \n",
    "\n",
    "for year in range(2002,2018):\n",
    "    cssci_highcited = cssci[cssci['year']==year-1]\n",
    "    cssci_highcited['cp95'] = cssci_highcited['cited'].quantile(0.95)\n",
    "    cssci_highcited = cssci_highcited[cssci_highcited['cited']>=cssci_highcited['cp95']]\n",
    "    cssci_highcited['index'] = range(cssci_highcited.shape[0])  # 之前的index序号不连贯了,重新整理\n",
    "    cssci_highcited.set_index('index',inplace=True)\n",
    "    cssci_nextyear = cssci[cssci['year']==year]\n",
    "    cssci_nextyear['index'] = range(cssci_nextyear.shape[0])  # 之前的index序号不连贯了,重新整理\n",
    "    cssci_nextyear.set_index('index',inplace=True)\n",
    "    # cssci_nextyear=cssci_nextyear[0:10]\n",
    "    text1 = cssci_highcited['cut_out']\n",
    "    # 计算某年论文与上一年top5%最相似论文\n",
    "    def fun1(x):\n",
    "        text2 = x['cut_out']\n",
    "        j,sim = tfidf_sim(text1,text2,dictionary)\n",
    "        x['sim'] = sim\n",
    "        x['nearest_title'] = cssci_highcited['title'][j]\n",
    "        return x  \n",
    "    cssci_nextyear = cssci_nextyear.apply(fun1, axis=1)\n",
    "    print(cssci_nextyear['sim'][0:10])\n",
    "    print(cssci_nextyear.title[0:10],cssci_nextyear.nearest_title[0:10])\n",
    "    cssci_new = cssci_new.append(cssci_nextyear) \n",
    "\n",
    "cssci_new.to_csv(path+'cssci_sim_tfidf.csv',encoding='utf8')\n",
    "\n",
    "cssci_new_short = cssci_new[['tlength','mag_name','mag_city_code','aunum','author_first','aufw','cited','download','fund01','fundn','page_num','year_period','year','month','kwnum','ablength','page_beg','sim']] \n",
    "cssci_new_short.to_csv(path+'cssci_sim_tfidf_short.csv',encoding='utf8',index=False)\n",
    "endtime = datetime.datetime.now()\n",
    "print((endtime - starttime).seconds)"
   ]
  },
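  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The core of `tfidf_sim` above is a nearest-neighbour query: index a reference set, score a query against it, and take the argmax. A minimal standalone sketch with toy documents (nothing reused from the cell above). Note that in the loop above the `SparseMatrixSimilarity` index is rebuilt for every row; building it once per year, as here, would give the same result faster."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from gensim import corpora, models, similarities\n",
    "\n",
    "refs = [['tax', 'policy', 'reform'],\n",
    "        ['machine', 'learning', 'text'],\n",
    "        ['fiscal', 'policy', 'growth']]\n",
    "dictionary = corpora.Dictionary(refs)\n",
    "corpus = [dictionary.doc2bow(t) for t in refs]\n",
    "tfidf = models.TfidfModel(corpus)\n",
    "\n",
    "# Build the cosine-similarity index over the reference set once\n",
    "index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=len(dictionary))\n",
    "\n",
    "query = tfidf[dictionary.doc2bow(['text', 'learning'])]\n",
    "sims = index[query]             # one cosine score per reference document\n",
    "best = int(sims.argmax())\n",
    "print(best, float(sims[best]))  # index of the most similar reference and its score"
   ]
  },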
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 本节结束"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
