PSET5 Speller Huge Number of Misspelled Words

PSET5 Speller Huge Number of Misspelled Words - python-3.x

I have been running the solution but is getting a high number of misspelled words.
WORDS MISSPELLED: 15904 as compared to staff's WORDS MISSPELLED: 955
Other than that, the word count is accurate and the runtime is alright.
I suspect the problem might come from the check / load function but I am not sure what caused it.
Some other code implemented a "to lowercase" in the check function, but I thought the (strcasecmp) would have done the job of comparing between strings, disregarding the case.
Check Function
bool check(const char *word)
{
int hashInt = hash(word);
if (table[hashInt] == NULL)
{
return 1;
}
node *cursor = table[hashInt];
while (cursor != NULL)
{
int i = strcasecmp(cursor -> word, word);
if (i == 0)
{
return 0;
break;
}
cursor = cursor -> next;
}
return false;
}
Load Function
bool load(const char *dictionary)
{
FILE *file = fopen(dictionary, "r");
if (file == NULL)
{
printf("error opening file");
return 1;
}
char word [LENGTH + 1];
while (fscanf(file, "%s\n", word) != EOF)
{
int hashInt = hash(word);
node *n = malloc(sizeof(node));
if (n == NULL)
{
unload();
return 1;
}
if (table[hashInt] == NULL)
{
table[hashInt] = n;
}
else
{
n -> next = table[hashInt];
table[hashInt] = n;
}
strcpy(n -> word, word);
wordLoaded++;
}
fclose(file);
return true;
}
Hash Function
unsigned int hash(const char *word)
{
unsigned int hash = 0;
for (int i = 0, n = strlen(word); i < n; i++)
hash = (hash << 2) ^ word[i];
return hash % N;
return 0;
}

From the spec:
dictionary is assumed to be a file containing a list of lowercase
words
and
Your implementation of check must be case-insensitive.
This int i = strcasecmp(cursor -> word, word); looks like it would fulfill the requirement as long as this int hashInt = hash(word); was also case insensitive. Alas, it is not; hash("A") and hash("a") will return different values.

Related

CS50 speller pset 5 (accessed 1 byte that does not belong to me but can't find the byte)

After running my code through help50 Valgrind, I got the following error message:
==6830== Invalid read of size 1
==6830== at 0x4C33614: strcasecmp (in /usr/lib/valgrind/vgpreload_memcheck-amd64-linux.so)
==6830== by 0x401176: check (dictionary.c:52)
==6830== by 0x400CD9: main (speller.c:112)
Looks like you're trying to access 1 byte of memory that isn't yours? Did you try to index into an array beyond its bounds? Take a closer look at line 52 of dictionary.c.
I think it has something to do with my check function but line 52 is just an if statement and I can't figure out where I'm trying to access that 1 byte from.**
My code is below:
#include <stdbool.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <strings.h>
#include "dictionary.h"
// Represents a node in a hash table
typedef struct node
{
char word[LENGTH + 1];
struct node *next;
}
node;
// Number of buckets in hash table
const unsigned int N = 1000;
//Number of words
unsigned int noWords = 0;
//Variable to check if dictionary loaded
bool isLoaded = false;
// Hash table
node *table[N];
// Returns true if word is in dictionary else false
bool check(const char *word)
{
//Changing letters to lower case because case insensitive
//Copy created because word argument is a constant. copy can be edited
int n = strlen(word) + 1;
char copy[LENGTH + 1];
for (int i = 0; i < n; i++)
{
copy[i] = tolower(word[i]);
}
// Add null terminator to end string
copy[n] = '\0';
//Hash the word to convert it to index and check if it's in any of the linked lists
int index = hash(copy);
if (table[index] != NULL) //Check if word is in linked list
{
for (node *cursor = table[index]; cursor != NULL; cursor = cursor -> next)
{
if (strcasecmp(cursor -> word, copy) == 0)
{
return true;
}
}
}
return false;
}
// Hashes word to a number
unsigned int hash(const char *word)
{
//Taken from http://www.cse.yorku.ca/~oz/hash.html (by djb2)
unsigned long h = 5381;
int c;
while ((c = *word++))
{
c = tolower(c);
h = (((h << 5) + h) + c) % N; /* hash * 33 + c*/
}
return h;
}
// Loads dictionary into memory, returning true if successful else false
bool load(const char *dictionary)
{
char word[LENGTH + 1];
//Open dictionary
FILE *f = fopen(dictionary, "r");
//Check if file can be opened
if (f == NULL)
{
printf("%s\n", "File cannot be opened!");
return false;
}
//Read strings from file
while (fscanf(f, "%s", word) != EOF)
{
noWords++;
node *newNodePointer = malloc(sizeof(node));
if (newNodePointer == NULL)
{
unload();
printf("Out of memory");
return false;
}
int index = hash(word);//hashtable is an array of linked list. index helps insert node into hashtable
strcpy(newNodePointer -> word, word);//Copies word from infile into new node's word field
if (table[index] == NULL)//Check if same word already exists in the bucket
{
newNodePointer -> next = NULL;
table[index] = newNodePointer;
}
else
{
newNodePointer -> next = table[index];
table[index] = newNodePointer;
}
free(newNodePointer);
}
fclose(f);
isLoaded = true;
return true;
}
// Returns number of words in dictionary if loaded else 0 if not yet loaded
unsigned int size(void)
{
if (isLoaded)
{
return noWords;
}
return 0;
}
// Unloads dictionary from memory, returning true if successful else false
bool unload(void)
{
//Check if there's even a loaded dictionary
if (!isLoaded)
{
return false;
}
//Iterate through hashtable
for (int i = 0; i < N; i++)
{
if (table[i] != NULL)
{
node *cursor = table[i];
while (cursor != NULL)
{
node *tmp = table[i]; //tmp pointer continues pointing at table[i] while cursor points at next item in hashtable
cursor = cursor -> next;
free(tmp);
}
}
}
return true;
}

The problem is from here in load: free(newNodePointer);. It just released the memory where the word and the next pointer are stored!

Longest length of valid parenthesis problem

I got stuck on this problem of finding the length of the longest valid parenthesis substring which either contains '(' or ')' . Actually there are many methods to solve this problem but i tried to go with finding the longest common substring (LCS) of the two strings..
I am getting the runtime error....
Line 1061: Char 9: runtime error: addition of unsigned offset to 0x7ffd2d443260 overflowed to 0x7ffd2d44325f (basic_string.h)
SUMMARY: UndefinedBehaviorSanitizer: undefined-behavior /usr/bin/../lib/gcc/x86_64-linux-gnu/8/../../../../include/c++/8/bits/basic_string.h:1070:9
Please check what is wrong in this....
class Solution {
public:
int LCS(string s1,string s2)
{
int m = s1.length();
int n = s2.length();
int dp[m+1][n+1];
memset(dp,0,sizeof(dp));
for(int i=0;i<=m;i++)
{
for(int j=0;j<=n;j++)
dp[i][j] = 0;
}
int res = 0;
for(int i=0;i<=m;i++)
{
for(int j=0;j<=n;j++)
{
if(i==j)
dp[i][j] = 0;
else if(s1[i-1]==s2[j-1]){
dp[i][j] = 1 + dp[i-1][j-1];
res = max(res,dp[i][j]);
}
else
dp[i][j] = 0;
}
}
return res;
}
int longestValidParentheses(string s) {
// here we can find the longest LCS..
if(s.length()==0 || s.length()==1)
return 0;
string str = "";
for(int i=s.length()-1;i>=0;i--)
{
if(s[i]==')')
str += '(';
else if(s[i]=='(')
str += ')';
}
int res = LCS(s,str);
return res;
}
};

CS50 pset5 SPELLER - most basic words & substrings issues

I have been stuck in pset5 for a while now. No matter from which angle I look to my code I cannot fin out what is wrong with it. I set my number of buckets randomly to 1000. Can someone find out the problem?
Below is the reply I get from check50
:) dictionary.c, dictionary.h, and Makefile exist
:) speller compiles
:( handles most basic words properly
expected "MISSPELLED WOR...", not "MISSPELLED WOR..."
:) handles min length (1-char) words
:) handles max length (45-char) words
:) handles words with apostrophes properly
:) spell-checking is case-insensitive
:( handles substrings properly
expected "MISSPELLED WOR...", not "MISSPELLED WOR..."
:| program is free of memory errors
can't check until a frown turns upside down
these is what I've done:
const unsigned int N = 1000;
// Returns true if word is in dictionary else false
bool check(const char *word)
{
//convert *word to lowercase so that the hash function is case-insensitive
int length = strlen(word);
char copy[length + 1];
for (int i = 0; i < length; i++)
{
copy[i] = tolower(word[i]);
}
// create a variable to return hashed value of word
int index_check = hash(copy);
//create cursor to traverse the linked list
node *cursor = table[index_check];
//check if word is in the linked list
while (cursor != NULL)
{
if (strcasecmp(word, cursor->word) == 0)
{
return true;
}
cursor = cursor->next;
}
//return false if cursor->next = NULL has been reached
return false;
}
// Hashes word into a number
unsigned int hash(const char *word)
{
// Source of hash function: stackoverflow.com/questions/14409466/simple-hash-functions
unsigned int count;
unsigned int hashValue = 0;
for(count = 0; word[count] != '\0'; count++)
{
hashValue = word[count] + (hashValue << 6) + (hashValue << 16) - hashValue;
}
return (hashValue % N);
}
// Loads dictionary into memory, returning true if successful else false
bool load(const char *dictionary)
{
// open dictionary file
FILE *f = fopen(dictionary, "r");
if (f == NULL)
{
printf("Dictionary could not be opened\n");
return false;
}
//initialize string as a buffer, to be used in next function, fscanf
char buffer[LENGTH + 1];
//loop to check whether end of file has been reached
while (fscanf(f, "%s", buffer) != EOF)
{
//read words from file into buffer
fscanf(f, "%s", buffer);
//allocate memory for a node and check if NULL
node *n = malloc(sizeof(node));
if (n == NULL)
{
printf("Could not allocate memmory (malloc *n)\n");
return false;
}
//copy "buffer" into the node created
strcpy(n->word, buffer);
//call hash function
int index = hash(buffer);
//check if it's the first word being inserted into that bucket
if (table[index] == NULL)
{
table[index] = n;
}
else
{
n->next = table[index];
table[index] = n;
}
size_dictionary++;
}
fclose(f);
return true;
}
// Unloads dictionary from memory, returning true if successful else false
bool unload(void)
{
node *cursor;
node *tmp;
// run thru all buckets
for(int i = 0; i < N; i++)
{
//check if bucket isn't NULL
if(table[i] != NULL)
{
cursor = table[i];
tmp = cursor;
while (tmp != NULL)
{
cursor = cursor->next;
free(tmp);
tmp = cursor;
}
}
}
return true;
}

The clue is in WORDS IN DICTIONARY. Only half the words in dictionary are loaded. That is because of the back-to-back fscanf in load.

tiagoK, I think I'm having the same issue here and been stuck with this exercice for days. What did you mean by " the loop should have been done with lenght + 1", where did you declare this string ? I've tried it on the strcasecmp field and the hash one but it doesn't change anything. Do you remember ?
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include "dictionary.h"
// Represents a node in a hash table
typedef struct node
{
char word[LENGTH + 1];
struct node *next;
}
node;
// Choose number of buckets in hash table
const unsigned int N = 17576;
// Hash table
node *table[N];
// Initialize new variables used in the program
unsigned int numberofwords;
unsigned int hashvalue;
// Returns true if word is in dictionary, else false
bool check(const char *word)
{
// Hash the word to determine its hash value
hashvalue = hash(word);
// Set cursor to the head of the linked list of the word
node* cursor = table[hashvalue];
// Traverse into the linked list comparing the word to find a correspondance while the cursor isn't pointing to null
while (cursor != NULL)
{
if (strcasecmp(cursor->word, word) == 0)
{
return true;
}
cursor = cursor->next;
}
return false;
}
// Hashes word to a number
unsigned int hash(const char *word)
{
// Inittialize a variable to keep track of the total of ASCII values
unsigned int sum = 0;
// Loop in all the letters in the word
for (int i = 0 ; i < strlen(word) ; i++)
{
// Ignore digits
if (isdigit(word[i]))
{
continue;
}
else if (isalpha(word[i]) || word[i] == ('\''))
{
// Check the first three character in the word while setting them lower letters in order to get they ASCII value
sum += tolower(word[0] - 'a');
sum += tolower(word[1] - 'a');
sum += tolower(word[2] - 'a');
}
}
// Divide the total of the three letters by the number of buckets
hashvalue = sum % N;
return hashvalue;
}
// Loads dictionary into memory, returning true if successful, else false
bool load(const char *dictionary)
{
// Open the file
FILE* file = fopen(dictionary, "r");
char word[LENGTH + 1];
{
if (file != NULL)
{
// While dictionary doesn't return EOF, read strings from the file, one at the time
while (fscanf(file, "%s", word) != EOF)
{
// Create a new node for the word and copy it in the node
node *n = malloc(sizeof(node));
if (n == NULL)
{
return 1;
}
strcpy (n->word, "%s");
// Hash word to obtain a hash value
hashvalue = hash(word);
// Insert word into hash table depending if it's the first word or not
if (table[hashvalue] != NULL)
{
n->next = table[hashvalue];
}
else
{
n->next = NULL;
}
table[hashvalue] = n;
// Add one to the counter
numberofwords++;
}
fclose(file);
}
else
{
fclose(file);
perror("Loading error");
return 1;
}
}
// Return true
return true;
}
// Returns number of words in dictionary if loaded, else 0 if not yet loaded
unsigned int size(void)
{
// Return the number of words we're keeping track of while loading the dictionary
return numberofwords;
}
// Unloads dictionary from memory, returning true if successful, else false
bool unload(void)
{
// Check all the head of the hash table
for(int i = 0; i < N; i++)
{
// Create a temporary node to not lose the rest of the linked list
node* cursor = table[i];
// Set the cursor to the next node while the temporary value remains at the inital location, then free it before to move to cursor
while(cursor != NULL)
{
node* tmp = cursor;
cursor = cursor->next;
free(tmp);
}
}
return true;
}

Why can‘t I read the input text file?

I try to read the name of a file using scanf but failed.
I am very bad at pointers and could not find the problem.
Is there a problem with the pointer to the array of string?
Here is my code:
int* Read_file(char* str[])
{
FILE* fp = fopen(str[1], "r");
if(fp == NULL)
{
printf("File cannot open\n");
return NULL;
}
int rows = 0;
while(!feof(fp))
{
if(fgetc(fp) == '\n')
{
rows ++;
}
}
rows ++;
int* keys = (int*)malloc(3 * rows * sizeof(int));
fseek(fp, 0L, 0);
while(!feof(fp))
{
for(int i = 0;i < rows;i ++)
{
for(int j = 0;j < 3;j ++)
{
fscanf(fp,"%d", &keys[(3 * i) + j]);
}
}
}
fclose(fp);
return keys;
}
int main()
{
char* str[20];
printf("Build_tree ");
scanf("%s",&str);
int* keys = Read_file(str);
return 0;
}

Okay, so the thing is:
You need a char array to store a string(file-name). So you should use a char array. Instead, you were using an array of char pointers.
An array is actually a series of memory blocks. The name of the array represents a pointer to the first element of the array(in this case the first char variable).
While reading a string, scanf needs a location to store it. So you need to give it the address of the first char variable of your char array, which is available in your char array itself. So you have to pass str only to scanf. In the case of normal int,float, and such fundamental data types, their names represent memory blocks and not pointers to memory blocks, and hence you had to use a &.
Then for fopen, fopen expects a char*(which points to the first character of the char array stoing the filename) and you have to provide it with a char* . So you should pass str.
I think your code should go like
int* Read_file(char str[])
{
FILE* fp = fopen(str, "r");
if(fp == NULL)
{
printf("File cannot open\n");
return NULL;
}
int rows = 0;
while(!feof(fp))
{
if(fgetc(fp) == '\n')
{
rows ++;
}
}
rows ++;
int* keys = (int*)malloc(3 * rows * sizeof(int));
fseek(fp, 0L, 0);
while(!feof(fp))
{
for(int i = 0;i < rows;i ++)
{
for(int j = 0;j < 3;j ++)
{
fscanf(fp,"%d", &keys[(3 * i) + j]);
}
}
}
fclose(fp);
return keys;
}
int main()
{
char str[20];
printf("Build_tree ");
scanf("%s",str);
int* keys = Read_file(str);
//Whatever you want to do with the keys
return 0;
}
Comment for any queries.

Find the word in the stream?

Given an infinite stream of characters and a list L of strings, create a function that calls an external API when a word in L is recognized during the processing of the stream.
Example:
L = ["ok","test","one","try","trying"]
stream = a,b,c,o,k,d,e,f,t,r,y,i,n,g.............
The call to external API will happen when 'k' is encountered, again when the 'y' is encountered, and again at 'g'.
My idea:
Create trie out of the list and navigate the nodes as you read from stream in linear time. But there would be a bug if you just do simple trie search.
Assume you have words "abxyz" and "xyw" and your input is "abxyw".In this case you can't recognize "xyw" with trie.
So search should be modified as below:
let's take above use case "abxyw". We start the search and we find we have all the element till 'x'. Moment you get 'x' you have two options:
Check if the current element is equal to the head of trie and if it is equal to head of trie then call recursive search.
Continue till the end of current word. In this case for your given input it will return false but for the recursive search we started in point 1, it will return true.
Below is my modified search but I think it has bugs and can be improved. Any suggestions?
#define SIZE 26
struct tri{
int complete;
struct tri *child[SIZE];
};
void insert(char *c, struct tri **t)
{
struct tri *current = *t;
while(*c != '\0')
{
int i;
int letter = *c - 'a';
if(current->child[letter] == NULL) {
current->child[letter] = malloc(sizeof(*current));
memset(current->child[letter], 0, sizeof(struct tri));
}
current = current->child[letter];
c++;
}
current->complete = 1;
}
struct tri *t;
int flag = 0;
int found(char *c, struct tri *tt)
{
struct tri *current = tt;
if (current == NULL)
return 0;
while(*c != '\0')
{
int i;
int letter = *c - 'a';
/* if this is the first char then recurse from begining*/
if (t->child[letter] != NULL)
flag = found(c+1, t->child[letter]);
if (flag == 1)
return 1;
if(!flag && current->child[letter] == NULL) {
return 0;
}
current = current->child[letter];
c++;
}
return current->complete;
}
int main()
{
int i;
t = malloc(sizeof(*t));
t->complete = 0;
memset(t, 0, sizeof(struct tri));
insert("weathez", &t);
insert("eather", &t);
insert("weather", &t);
(1 ==found("weather", t))?printf("found\n"):printf("not found\n");
return 0;
}

What you want to do is exactly what Aho-Corasick algorithm does.
You can take a look at my Aho-Corasick implementation. It's contest-oriented, so maybe not focused on readability but I think it's quite clear:
typedef vector<int> VI;
struct Node {
int size;
Node *fail, *output;
VI id;
map<char, Node*> next;
};
typedef pair<Node*, Node*> P;
typedef map<char, Node*> MCP;
Node* root;
inline void init() {
root = new Node;
root->size = 0;
root->output = root->fail = NULL;
}
Node* add(string& s, int u, int c = 0, Node* p = root) {
if (p == NULL) {
p = new Node;
p->size = c;
p->fail = p->output = NULL;
}
if (c == s.size()) p->id.push_back(u);
else {
if (not p->next.count(s[c])) p->next[s[c]] = NULL;
p->next[s[c]] = add(s, u, c + 1, p->next[s[c]]);
}
return p;
}
void fill_fail_output() {
queue<pair<char, P> > Q;
for (MCP::iterator it=root->next.begin();
it!=root->next.end();++it)
Q.push(pair<char, P> (it->first, P(root, it->second)));
while (not Q.empty()) {
Node *pare = Q.front().second.first;
Node *fill = Q.front().second.second;
char c = Q.front().first; Q.pop();
while (pare != root && !pare->fail->next.count(c))
pare=pare->fail;
if (pare == root) fill->fail = root;
else fill->fail = pare->fail->next[c];
if (fill->fail->id.size() != 0)
fill->output = fill->fail;
else fill->output = fill->fail->output;
for (MCP::iterator it=fill->next.begin();
it!=fill->next.end();++it)
Q.push(pair<char,P>(it->first,P(fill,it->second)));
}
}
void match(int c, VI& id) {
for (int i = 0; i < id.size(); ++i) {
cout << "Matching of pattern " << id[i];
cout << " ended at " << c << endl;
}
}
void search(string& s) {
int i = 0, j = 0;
Node *p = root, *q;
while (j < s.size()) {
while (p->next.count(s[j])) {
p = p->next[s[j++]];
if (p->id.size() != 0) match(j - 1, p->id);
q = p->output;
while (q != NULL) {
match(j - 1, q->id);
q = q->output;
}
}
if (p != root) {
p = p->fail;
i = j - p->size;
}
else i = ++j;
}
}
void erase(Node* p = root) {
for (MCP::iterator it = p->next.begin();
it != p->next.end(); ++it)
erase(it->second);
delete p;
}
int main() {
init();
int n;
cin >> n;
for (int i = 0; i < n; ++i) {
string s;
cin >> s;
add(s, i);
}
fill_fail_output();
string text;
cin >> text;
search(text);
erase(root);
}

Develop Reference

node.js excel linux python-3.x azure haskell apache-spark rust .htaccess string

PSET5 Speller Huge Number of Misspelled Words - python-3.x

Related

CS50 speller pset 5 (accessed 1 byte that does not belong to me but can't find the byte)

Longest length of valid parenthesis problem

CS50 pset5 SPELLER - most basic words & substrings issues

Why can‘t I read the input text file?

Find the word in the stream?

Categories

Resources